APPENDIX

Published

26 December 2025

This appendix includes all of the execution steps used in analyzing the data, from preprocessing with the raw public data (A0) to statistical analysis (A1 to A23).

In order to reproduce these steps, it is necessary to place all of the following files in the same directory as 000_analysis_script.qmd. Moreover, a revn.lock file and a Docker image were provided to ensure that the analysis can be executed within the same software environment used by the authors.

To conduct the analyses (A1 to A23) without running the preprocessing steps, place 999_clean_data.rds in the same directory, run the Setup section, and start at any Analyses section. Each analysis section is independent and can be run separately.

Setup

Set working directory

# Set the working directory to the current folder
if (requireNamespace("rstudioapi") && rstudioapi::isAvailable()) {
  setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
} else {
  setwd(dirname(knitr::current_input()))
}

Load packages

if (!require("pacman")) install.packages("pacman")
pacman::p_load(char = c(
  # data import and cleaning
  "readr",
  "readxl",
  "qualtRics",
  "dplyr",
  "tidyr",
  "stringr",
  "purrr",
  "tibble",
  "forcats",
  "janitor",
  "labelled",
  "lubridate",

  # visualization
  "ggplot2",
  "MetBrewer",
  "scales",
  "ggridges",
  "ggtext",
  "ggfx",
  "ggh4x",
  "ggeffects",
  "cowplot",
  "grid",
  "gridExtra",
  "gtable",
  "ggplotify",
  "see",
  "corrplot",
  "visdat",
  "ggflags",
  "showtext",
  "sysfonts",

  # modeling and statistics
  "mgcv",
  "lme4",
  "survey",
  "metafor",
  "binom",
  "lsr",
  "car",
  "emmeans",
  "interactions",
  "rmcorr",
  "psych",
  "Hmisc",
  "weights",

  # SEM and reliability
  "lavaan",
  "semTools",

  # reporting and tables
  "broom.mixed",
  "kableExtra",
  "sjPlot",
  "flextable",
  "officer",
  "report",
  "performance",
  "reactable",

  # spatial data and maps
  "sf",
  "rnaturalearth",
  "rnaturalearthdata",
  "leaflet",
  "leaflet.extras",
  "leaflet.extras2",

  # utilities
  "countrycode",
  "htmltools",
  "rlang",
  "sessioninfo"
))

Define global settings

options(
  # Remove scientific notation
  scipen = 999,
  width = 150,
  # Clean up dplyr messages
  dplyr.summarise.inform = FALSE)

# Set up theme for plots
sysfonts::font_add_google("Inter")
showtext::showtext_auto()

theme_gmh <- 
  ggplot2::theme_minimal(base_family = "Inter", base_size = 12) +
  ggplot2::theme(
    text = element_text(family = "Inter", colour = "#051520"),
    axis.text.y  = element_text(color = "#051520"),
    axis.text.x  = element_text(
      color = "#051520",
      margin = margin(t = 1),
      face = "bold"
    ),
    axis.title.x = element_text(color = "#051520", face = "bold"),
    axis.title.y = element_text(color = "#051520", face = "bold"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line.x = element_line(colour = "#051520", linewidth = 0.4),
    plot.margin = margin(6, 6, 6, 6),
    plot.subtitle = ggplot2::element_text(color = "#051520"),
    plot.background = ggplot2::element_rect(fill = "transparent", color = NA),
    panel.background = ggplot2::element_rect(fill = "transparent", color = NA),
    legend.background = ggplot2::element_rect(fill = "transparent", color = NA)
  )

ggplot2::theme_set(theme_gmh)

# Print variables in a tidy way
table_label <- function(col) {
  # extract what is after $ in dataframe$column
  name <- sub(".*\\$(.+)", "\\1", deparse(substitute(col)))
  # extract the label of the given column
  lab  <- attr(col, "label")
  # print header wih column name and label
  cat(sprintf("$%s\n%s\n", name, lab))
  # print table output with NA counts
  tbl <- table(col, useNA = "always")
  names(dimnames(tbl)) <- NULL
  print(tbl)
  # print the class of the column
  cat("Class:", paste(class(col), collapse = ", "), "\n")
}

# Print a pretty table
print_reactable <- function(data, sorted_col, width) {
  reactable::reactable(
    data,
    pagination = FALSE,
    height = 650,
    width = width,
    defaultSorted = sorted_col,
    defaultSortOrder = "asc",
    searchable = TRUE,
    striped = TRUE,
    compact = TRUE,
    highlight = TRUE,
    defaultColGroup = reactable::colGroup(headerVAlign = "bottom"),
    defaultColDef = reactable::colDef(
      vAlign = "center",
      headerVAlign = "bottom",
      class = "cell",
      headerClass = "header"
    )
  )
}

# Print pretty summaries
print_summ <- function(model, design, var, term) {
  format_p <- function(p) {
    if (p < 0.001) {
      return("< .001")
    }
    base::format(base::round(p, 3), nsmall = 3)
  }

  term_test <- survey::regTermTest(model, term)

  svy_resid <-
    update(design, .resid = stats::residuals(model, type = "response"))

  var_y <-
    survey::svyvar(stats::as.formula(paste0("~", var)), design = svy_resid)[1]

  var_e <- survey::svyvar(~.resid, design = svy_resid)[1]

  r2 <- 1 - (var_e / var_y)
  cohens_f <- base::sqrt(r2 / (1 - r2))
  percent_var_explained <- r2 * 100

  tibble::tibble(
    Ward_F = 
      base::format(base::round(base::as.numeric(term_test$Ftest[1]), 2), nsmall = 2),
    df1 = term_test$df,
    df2 = term_test$ddf,
    p = format_p(term_test$p),
    r2 = base::format(base::round(r2, 4), nsmall = 4),
    cohens_f = base::format(base::round(cohens_f, 4), nsmall = 4),
    percent_var_explained =
      base::format(base::round(percent_var_explained, 4), nsmall = 4)
    )
}

# Calculate weighted correlation
weighted_corr <- function(dat, var_x, var_y, multiple = FALSE) {
  if (!isTRUE(multiple)) {
  
  var_x <- rlang::ensym(var_x)
  var_y <- rlang::ensym(var_y)
  
  design <- survey::svydesign(
    ids = ~ 1,
    weights = ~ ps_weight,
    data = dat
  )
  
  est <- jtools::svycor(
    stats::as.formula(
      paste0("~", rlang::as_name(var_x), " + ", rlang::as_name(var_y))),
    design,
    sig.stats = TRUE,
    bootn = 1000,
    mean1 = TRUE
  )
  
  data.frame(
    r = as.character(format(round(est$cors[2], 3), nsmall = 3)),
    t = as.character(format(round(est$t.values[2], 2), nsmall = 2)),
    p = dplyr::if_else(
      est$p.values[2] < 0.01, "<.001", 
      as.character(format(round(est$p.values[2], 3), nsmall = 3)))
  )
  
  } else {
  outcome_sym <- rlang::ensym(var_x)
  items_val <- rlang::eval_tidy(rlang::enquo(var_y))

  design <- survey::svydesign(ids = ~1, weights = ~ps_weight, data = dat)

  results <- purrr::map_dfr(items_val, function(item_name) {
    f <- stats::as.formula(paste0("~", rlang::as_name(outcome_sym), " + ", item_name))
    est <- jtools::svycor(f, design, sig.stats = TRUE, bootn = 1000, mean1 = TRUE)

    r_val <- est$cors[2]
    t_val <- est$t.values[2]
    p_val <- est$p.values[2]
      
  data.frame(
    item = item_name,
    r = as.character(format(round(est$cors[2], 3), nsmall = 3)),
    t = as.character(format(round(est$t.values[2], 2), nsmall = 2)),
    p = dplyr::if_else(
      est$p.values[2] < 0.01, "<.001", 
      as.character(format(round(est$p.values[2], 3), nsmall = 3)))
  )
    
  })
  return(results)
  }
}

# Define MPWB items and labels
mpwb_items <- c(
  "mpwb_positive_relationships",
  "mpwb_meaning",
  "mpwb_competence",
  "mpwb_engagement",
  "mpwb_self_esteem",
  "mpwb_optimism",
  "mpwb_positive_emotion",
  "mpwb_emotional_stability",
  "mpwb_resilience",
  "mpwb_vitality"
)

mpwb_labels <- c(
  mpwb_positive_relationships = "Positive relationships",
  mpwb_meaning = "Meaning",
  mpwb_competence = "Competence",
  mpwb_engagement = "Engagement",
  mpwb_self_esteem = "Self-esteem",
  mpwb_optimism = "Optimism",
  mpwb_positive_emotion = "Positive emotion",
  mpwb_emotional_stability = "Emotional stability",
  mpwb_resilience = "Resilience",
  mpwb_vitality = "Vitality"
)

phq4_items <- c("phq_interest", "phq_down", "gad_anxious", "gad_worry")

# Define EU countries
eu_countries <- c(
  "Austria",
  "Belgium",
  "Bulgaria",
  "Croatia",
  "Cyprus",
  "Czech Republic",
  "Denmark",
  "Estonia",
  "Finland",
  "France",
  "Germany",
  "Greece",
  "Hungary",
  "Ireland",
  "Italy",
  "Latvia",
  "Netherlands",
  "Poland",
  "Portugal",
  "Romania",
  "Slovakia",
  "Slovenia",
  "Spain",
  "Sweden"
)

# List of countries whose weight scores were replaced by 1.
flagged_countries <- 
  c("Moldova", "Romania", "Nigeria", "Montenegro", "Angola",
    "Morocco", "Uruguay", "Paraguay", "Greece", "Iran",
    "Hungary", "Kosovo", "Yemen", "Chile", "Uganda")

Load data

The data collection began on June 2, 2025, with a soft-launch phase. The survey’s time zone was set to New York City. Due to time zone differences, some responses show a date of June 1, 2025, even though it was already June 2 in the collaborators’ local time. Some collaborators were residing in countries different from their target country.

# Raw public dataset
df_pub_raw <- base::readRDS("999_public_data.rds")

# View number of rows in the raw dataset
nrow(df_pub_raw)
[1] 68311
# Cleaned dataset
df_gmh <- base::readRDS("999_cleaned_data.rds")

# Create general design
svy <- survey::svydesign(ids = ~ 1, weights = ~ ps_weight, data = df_gmh)

# View number of rows in the cleaned dataset
nrow(df_gmh)
[1] 53799
# Codebook
codebook <- readxl::read_excel(
  path = "222_codebook.xlsx",
  sheet = "df_cleaned",
  skip = 1,
  col_names = TRUE
)

A0. Data Preprocessing

A0.1. Cleaning the dataset

Rename columns

df_pub <- df_pub_raw |>
  dplyr::rename(
    duration_sec = `Duration (in seconds)`,
    
    mpwb_competence = Q5,
    mpwb_emotional_stability = Q7,
    mpwb_engagement = Q9,
    mpwb_meaning = Q11,
    mpwb_optimism = Q13,
    mpwb_positive_emotion = Q15,
    mpwb_positive_relationships = Q17,
    mpwb_resilience = Q19,
    mpwb_self_esteem = Q21,
    mpwb_vitality = Q23,
    
    life_satisfaction = Q29,
    income_orig = Q31,
    income_text_orig = Q31_10_TEXT,
    household_size = Q32,
    birth_year_orig = Q25,
    sex_orig = Q26,
    education_orig = Q27,
    employment_orig = Q28,
    ethnicity_citizenship_orig = Q30,
    assets_orig = Q34,
    debts_orig = Q33,
    bot_check = Q43,
    followup = Q35,
    
    phq_interest = Q36_1,
    phq_down = Q36_2,
    gad_anxious = Q36_3,
    gad_worry = Q36_4,
    
    childhood_SES = Q37,
    fin_outlook = Q38,
    fin_outlook_conf = Q39,
    attention_care = Q40,
    work_arrangement = Q41
  ) |>
  dplyr::relocate(Q_Language, .after = UserLanguage) |>
  
  # Overview of the data
  dplyr::glimpse(width = 100)
Rows: 68,311
Columns: 45
$ StartDate                   <dttm> 2025-06-01 07:14:43, 2025-06-01 07:33:44, 2025-06-01 19:24:40…
$ EndDate                     <dttm> 2025-06-01 07:31:52, 2025-06-01 07:41:06, 2025-06-01 19:30:50…
$ Status                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ Progress                    <dbl> 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 10…
$ duration_sec                <dbl> 1028, 442, 370, 426, 512, 344, 341, 744, 582, 1006, 233, 173, …
$ Finished                    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ RecordedDate                <dttm> 2025-06-01 07:31:52, 2025-06-01 07:41:07, 2025-06-01 19:30:52…
$ ResponseId                  <chr> "R_2i29tTIFUyYilqv", "R_2nemeLi6AnL1uNP", "R_3LqMY0lbugweTSh",…
$ UserLanguage                <chr> "FR-SEN", "FR-SEN", "PT-BRA", "PT-BRA", "PT-BRA", "PT-BRA", "P…
$ Q_Language                  <chr> "FR-SEN", "FR-SEN", "PT-BRA", "PT-BRA", "PT-BRA", "PT-BRA", "P…
$ mpwb_competence             <dbl> 6, 5, 5, 5, 5, 5, 5, 7, 5, 5, 4, 7, 5, 5, 5, 4, 6, 6, 5, 5, 7,…
$ mpwb_emotional_stability    <dbl> 6, 3, 5, 5, 5, 4, 5, 7, 6, 7, 5, 5, 4, 7, 5, 4, 7, 6, 4, 5, 5,…
$ mpwb_engagement             <dbl> 6, 6, 5, 6, 5, 4, 5, 7, 3, 5, 5, 6, 7, 7, 5, 4, 4, 5, 5, 3, 7,…
$ mpwb_meaning                <dbl> 6, 3, 5, 6, 4, 4, 6, 7, 5, 6, 4, 5, 4, 6, 5, 4, 7, 5, 4, 5, 7,…
$ mpwb_optimism               <dbl> 7, 5, 5, 6, 5, 3, 7, 7, 6, 7, 4, 6, 5, 6, 5, 4, 7, 5, 6, 6, 7,…
$ mpwb_positive_emotion       <dbl> 5, 3, 5, 6, 5, 7, 6, 7, 7, 5, 4, 5, 6, 6, 4, 4, 7, 5, 5, 5, 7,…
$ mpwb_positive_relationships <dbl> 5, 5, 5, 4, 7, 7, 6, 7, 5, 7, 5, 6, 5, 6, 5, 4, 6, 5, 4, 3, 7,…
$ mpwb_resilience             <dbl> 5, 5, 5, 6, 5, 3, 6, 7, 5, 7, 4, 6, 4, 6, 4, 4, 7, 7, 4, 3, 6,…
$ mpwb_self_esteem            <dbl> 6, 5, 5, 7, 3, 4, 7, 7, 6, 5, 4, 7, 5, 6, 5, 4, 6, 6, 6, 6, 7,…
$ mpwb_vitality               <dbl> 5, 1, 4, 5, 3, 4, 5, 7, 6, 7, 4, 6, 4, 6, 5, 4, 5, 5, 5, 5, 5,…
$ life_satisfaction           <dbl> 7, 6, 9, 8, 5, 8, 7, 10, 8, 10, 0, 8, 6, 9, 6, 5, 10, 8, 7, 8,…
$ income_orig                 <dbl> 7, 9, 6, 9, 7, 8, 5, 4, 5, 7, 1, 5, 4, 6, 7, 6, 10, 10, 6, 3, …
$ income_text_orig            <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ household_size              <dbl> 3, 5, 1, 4, 5, 4, 2, 4, 2, 1, 1, 6, 5, 6, 3, 4, 10, 7, 1, 12, …
$ birth_year_orig             <chr> "1989", "1984", "1971", "1986", "1993", "2005", "1986", "1975"…
$ sex_orig                    <dbl> 2, 2, 2, 1, 2, 2, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 1, 2, 1, 2, 2,…
$ education_orig              <dbl> 6, 6, 5, 8, 6, 3, 7, 5, 7, 6, 5, 8, 5, 7, 7, 6, 5, 6, 5, 4, 7,…
$ employment_orig             <chr> "3", "3", "3", "3", "8", "2,8", "3", "3", "1", "6", "8", "3", …
$ ethnicity_citizenship_orig  <chr> "3,6,10", "1,10", "5,10", "5,10", "3,10", "5,10", "1,10", "1,1…
$ assets_orig                 <chr> "5", "2", "20.000", "1000000", "5000", "0,00", "250000", "1,00…
$ debts_orig                  <chr> "10000000", "2", "18000", "0", "125000", "0,00", "0", "1,000.0…
$ bot_check                   <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ followup                    <dbl> 2, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 2, 1,…
$ phq_interest                <dbl> NA, 2, NA, 2, 3, 6, 1, NA, 2, 1, NA, 2, NA, 1, 2, 2, NA, 1, 2,…
$ phq_down                    <dbl> NA, 3, NA, 1, 3, 4, 1, NA, 1, 1, NA, 2, NA, 2, 2, 2, NA, 1, 2,…
$ gad_anxious                 <dbl> NA, 2, NA, 2, 3, 7, 2, NA, 2, 1, NA, 3, NA, 2, 3, 2, NA, 1, 1,…
$ gad_worry                   <dbl> NA, 1, NA, 2, 3, 7, 1, NA, 1, 1, NA, 3, NA, 2, 3, 2, NA, 1, 2,…
$ childhood_SES               <dbl> NA, 4, NA, 4, 2, 4, 1, NA, 1, 4, NA, 2, NA, 4, 4, 3, NA, 3, 4,…
$ fin_outlook                 <dbl> NA, 3, NA, 4, 4, 5, 5, NA, 5, 4, NA, 5, NA, 4, 5, 5, NA, 4, 5,…
$ fin_outlook_conf            <dbl> NA, 10, NA, 8, 8, 10, 10, NA, 10, 8, NA, 8, NA, 9, 7, 8, NA, 1…
$ attention_care              <dbl> NA, 5, NA, 5, 7, 4, 5, NA, 5, 6, NA, 5, NA, 5, 6, 5, NA, 5, 4,…
$ work_arrangement            <dbl> NA, 4, NA, 3, NA, 1, 2, NA, 5, NA, NA, 1, NA, 1, 3, 4, NA, 1, …
$ br                          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ bs                          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ CoreMPWB_DO                 <chr> "Q4|Q23|Q6|Q21|Q8|Q15|Q10|Q19|Q12|Q9|Q14|Q5|Q16|Q7|Q18|Q17|Q20…

Identification of the Countries

# Sanity check: View the counts of language
table_label(df_pub$UserLanguage)
$UserLanguage
User Language
 AM-ARM  AM-ETH  AR-ARE  AR-BHR  AR-DZA  AR-EGY  AR-KWT  AR-LBN  AR-MAR  AR-OMN  AR-QAT  AR-SAU  AR-TCD  AR-YEM  BG-BGR  BN-BGD  BS-BIH CNR-MNE 
    334     303      66     100     203     322     106     416     302     520     503     296       7     577     393     536     642     358 
 CS-CZE  DA-DNK  DE-AUT  DE-CHE  DE-DEU  EL-CYP  EL-GRC      EN  EN-ARE  EN-AUS  EN-BHR  EN-CAN  EN-EGY  EN-EST  EN-ETH  EN-GBR  EN-GEN  EN-GEO 
    267     338     685     452    1008     218     532    5549     270     605     111     535     547       9     249     852      69      54 
 EN-HKG  EN-HUN  EN-IDN  EN-IND  EN-IRL  EN-KOR  EN-KWT  EN-MNG  EN-MYS  EN-NGA  EN-NLD  EN-NOR  EN-PAK  EN-PHL  EN-QAT  EN-SGP  EN-UGA  EN-YEM 
     17       6      12     921     461      11     209      40     203     721     161      26     347    2280      23     298     332       3 
 EN-ZAF  EN-ZMB  EN-ZWE  ES-ARG  ES-BOL  ES-CHL  ES-ECU  ES-ESP  ES-MEX  ES-PER  ES-PRY  ES-URY   ES-US  ET-EST  FA-IRN  FI-FIN FIL-PHL  FR-BEL 
    279      34     169     769     341     240    1075     729    1164    1031     205     815     159    2393     292     275    1276      70 
 FR-CAN  FR-CHE  FR-FRA  FR-MDG  FR-SEN  FR-TCD  HE-ISR  HI-IND  HR-HRV  HU-HUN  ID-IDN  IT-CHE  IT-ITA  JA-JPN  KA-GEO  KK-KAZ  KO-KOR  KY-KGZ 
    339     292    1175     169     211     185     437     706     455     729    1489      79     566     549     450     131     481     166 
 LV-LVA  MK-MKD  MN-MNG  MS-MYS  NL-BEL  NL-NLD  NO-NOR  PL-POL  PT-AGO  PT-BRA  PT-MOZ  PT-PRT  PT-TLS  RO-MDA  RO-ROU  RU-KAZ  RU-KGZ  RU-RUS 
   1023     268     327     613     261     287     483    1288     329    2094     154     579     277     511     861     656     209    1322 
 RU-UZB  SK-SVK  SL-SVN  SN-ZWE SQI-ALB SQI-XKX  SR-SRB  SR-XKX  SV-SWE  TH-THA  TR-TUR  UK-UKR  UR-PAK  UZ-UZB  ZH-CHN  ZH-HKG  ZH-TWN    <NA> 
    119     724     746     106    2284    1371     420       2    1149     440     682     749     160     543    2523     220     201       0 
Class: character 
# Create column with country names mapped from UserLanguage
country_map <- c(
  "SQI-ALB" = "Albania",
  "AR-DZA" = "Algeria",
  "PT-AGO" = "Angola",
  "ES-ARG" = "Argentina",
  "AM-ARM" = "Armenia",
  "EN-AUS" = "Australia",
  "DE-AUT" = "Austria",
  "AR-BHR" = "Bahrain",
  "EN-BHR" = "Bahrain",
  "BN-BGD" = "Bangladesh",
  "FR-BEL" = "Belgium",
  "NL-BEL" = "Belgium",
  "ES-BOL" = "Bolivia",
  "BS-BIH" = "Bosnia-Herzegovina",
  "PT-BRA" = "Brazil",
  "BG-BGR" = "Bulgaria",
  "EN-CAN" = "Canada",
  "FR-CAN" = "Canada",
  "AR-TCD" = "Chad",
  "FR-TCD" = "Chad",
  "ES-CHL" = "Chile",
  "ZH-CHN" = "China",
  "HR-HRV" = "Croatia",
  "EL-CYP" = "Cyprus",
  "CS-CZE" = "Czech Republic",
  "DA-DNK" = "Denmark",
  "ES-ECU" = "Ecuador",
  "AR-EGY" = "Egypt",
  "EN-EGY" = "Egypt",
  "EN-EST" = "Estonia",
  "ET-EST" = "Estonia",
  "AM-ETH" = "Ethiopia",
  "EN-ETH" = "Ethiopia",
  "FR-FRA" = "France",
  "FI-FIN" = "Finland",
  "EN-GEO" = "Georgia",
  "KA-GEO" = "Georgia",
  "DE-DEU" = "Germany",
  "EL-GRC" = "Greece",
  "EN-HKG" = "Hong Kong",
  "ZH-HKG" = "Hong Kong",
  "EN-HUN" = "Hungary",
  "HU-HUN" = "Hungary",
  "EN-IND" = "India",
  "HI-IND" = "India",
  "ID-IDN" = "Indonesia",
  "EN-IDN" = "Indonesia",
  "FA-IRN" = "Iran",
  "EN-IRL" = "Ireland",
  "HE-ISR" = "Israel",
  "IT-ITA" = "Italy",
  "JA-JPN" = "Japan",
  "KK-KAZ" = "Kazakhstan",
  "RU-KAZ" = "Kazakhstan",
  "EN-KOR" = "Republic of Korea",
  "KO-KOR" = "Republic of Korea",
  "SQI-XKX" = "Kosovo",
  "SR-XKX" = "Kosovo",
  "AR-KWT" = "Kuwait",
  "EN-KWT" = "Kuwait",
  "KY-KGZ" = "Kyrgyzstan",
  "RU-KGZ" = "Kyrgyzstan",
  "LV-LVA" = "Latvia",
  "AR-LBN" = "Lebanon",
  "MK-MKD" = "North Macedonia",
  "FR-MDG" = "Madagascar",
  "MS-MYS" = "Malaysia",
  "EN-MYS" = "Malaysia",
  "ES-MEX" = "Mexico",
  "RO-MDA" = "Moldova",
  "EN-MNG" = "Mongolia",
  "MN-MNG" = "Mongolia",
  "CNR-MNE" = "Montenegro",
  "AR-MAR" = "Morocco",
  "PT-MOZ" = "Mozambique",
  "NL-NLD" = "Netherlands",
  "EN-NLD" = "Netherlands",
  "EN-NGA" = "Nigeria",
  "EN-NOR" = "Norway",
  "NO-NOR" = "Norway",
  "AR-OMN" = "Oman",
  "UR-PAK" = "Pakistan",
  "EN-PAK" = "Pakistan",
  "ES-PRY" = "Paraguay",
  "ES-PER" = "Peru",
  "EN-PHL" = "Philippines",
  "FIL-PHL" = "Philippines",
  "PL-POL" = "Poland",
  "PT-PRT" = "Portugal",
  "AR-QAT" = "Qatar",
  "EN-QAT" = "Qatar",
  "RO-ROU" = "Romania",
  "RU-RUS" = "Russia",
  "AR-SAU" = "Saudi Arabia",
  "FR-SEN" = "Senegal",
  "SR-SRB" = "Serbia",
  "EN-SGP" = "Singapore",
  "SK-SVK" = "Slovakia",
  "SL-SVN" = "Slovenia",
  "EN-ZAF" = "South Africa",
  "ES-ESP" = "Spain",
  "SV-SWE" = "Sweden",
  "FR-CHE" = "Switzerland",
  "DE-CHE" = "Switzerland",
  "IT-CHE" = "Switzerland",
  "ZH-TWN" = "Taiwan",
  "TH-THA" = "Thailand",
  "PT-TLS" = "Timor-Leste",
  "TR-TUR" = "Türkiye",
  "EN-UGA" = "Uganda",
  "UK-UKR" = "Ukraine",
  "AR-ARE" = "UAE",
  "EN-ARE" = "UAE",
  "EN-GBR" = "UK",
  "EN" = "USA",
  "ES-US"  = "USA",
  "ES-URY" = "Uruguay",
  "RU-UZB" = "Uzbekistan",
  "UZ-UZB" = "Uzbekistan",
  "AR-YEM" = "Yemen",
  "EN-YEM" = "Yemen",
  "EN-ZMB" = "Zambia",
  "EN-ZWE" = "Zimbabwe",
  "SN-ZWE" = "Zimbabwe",
  "EN-GEN" = "Global"
)

df_pub <- df_pub |>
  dplyr::mutate(
    
    # Identify country based on UserLanguage
    country = country_map[UserLanguage],
    
    # Transform UserLanguage to ISO codes
    # (the last three characters identify the ISO3 code except USA)
    iso3 = stringr::str_extract(UserLanguage, "[A-Z]{3}$"),
    
    # Clean the code for the USA
    iso3 = dplyr::case_when(
      UserLanguage == "EN" ~ "USA",
      UserLanguage == "ES-US" ~ "USA",
      UserLanguage == "EN-GEN" ~ NA_character_,
      TRUE ~ iso3
      ),
    # Convert ISO3 to ISO2
    iso2 = countrycode::countrycode(
      iso3,
      origin = "iso3c",
      destination = "iso2c",
      custom_match = c("XKX" = "XK"))
  ) |>
  dplyr::relocate(country, iso2, iso3, .after = UserLanguage)

# Sanity check: Cross-tab of countries by language
df_pub |>
  dplyr::count(country, iso2, iso3, sort = TRUE) |>
  dplyr::filter(!is.na(country)) |> 
  print_reactable(sorted_col = "country", width = 500)
# Cleanup
rm(country_map)

Global Version Processing

A global version of the survey was created to ensure people from countries that weren’t specifically targeted in this study or whose native languages weren’t provided could still take part. This version didn’t have any changes made for specific countries. There was only an open-text field for the income item, and all financial items asked for values in USD.

# Identify country and citizenship
gen_ident <- 
  readr::read_csv("111_generic_version_country.csv", show_col_types = FALSE) |> 
  dplyr::glimpse(width = 100)
Rows: 69
Columns: 2
$ ResponseId  <chr> "R_4CJBLtS3qvvRTf7", "R_2EGKdy6ce2zvQls", "R_8f1msPTljX0SGpw", "R_7f1bVmdQG7qh…
$ country_gen <chr> "Australia", "Austria", "Austria", "Bangladesh", "Bangladesh", "Bangladesh", "…
nrow(df_pub)
[1] 68311
df_pub <- df_pub |>
  dplyr::left_join(gen_ident, by = "ResponseId") |>
  dplyr::relocate(country_gen, .after = country)

nrow(df_pub)
[1] 68311
# Sanity check: View the country counts of global version participants
# It was not possible to identify the country for one participant
df_pub |> dplyr::filter(UserLanguage == "EN-GEN") |>
  dplyr::group_by(country_gen) |> 
  dplyr::summarise(n = dplyr::n()) |>
  base::print(n = Inf)
# A tibble: 34 × 2
   country_gen                          n
   <chr>                            <int>
 1 Afghanistan                          1
 2 Australia                            1
 3 Austria                              2
 4 Bangladesh                           4
 5 Belgium                              7
 6 Bhutan                               2
 7 Colombia                             3
 8 Democratic Republic of the Congo     1
 9 Dominican Republic                   1
10 Finland                              1
11 France                               3
12 Germany                              2
13 Guatemala                            1
14 Honduras                             1
15 India                                2
16 Italy                                1
17 Kenya                                1
18 Korea                                1
19 Lebanon                              7
20 Namibia                              1
21 Nepal                                1
22 New Zealand                          2
23 Norway                               1
24 Oman                                 5
25 Pakistan                             1
26 Philippines                          2
27 Sri Lanka                            1
28 Sweden                               1
29 Thailand                             1
30 UAE                                  4
31 UK                                   2
32 Zambia                               3
33 Zimbabwe                             1
34 <NA>                                 1
# Cleanup
rm(gen_ident)

Exclusion of Countries with Small Sample Sizes

We excluded the Global version and Zambia because the sample sizes were not sufficiently large. The Global version does not have the country-specific changes that were made in the target countries, consequently those answers can’t be compared. Zambia is not included because it only has 34 participants, which is less than the 120 required.

# View countries with less than 120 participants
df_pub |> 
  dplyr::group_by(country) |> 
  dplyr::summarise(n = dplyr::n()) |>
  dplyr::filter(n < 120) |>
  dplyr::arrange(n)
# A tibble: 2 × 2
  country     n
  <chr>   <int>
1 Zambia     34
2 Global     69
# Exclude Global version and Zambia
nrow(df_pub)
[1] 68311
df_pub <- df_pub |> 
  dplyr::filter(UserLanguage != "EN-GEN", UserLanguage != "EN-ZMB") |>
  dplyr::select(-country_gen)

nrow(df_pub)
[1] 68208

Location Validation

location <-
  readr::read_csv("111_administrative_location.csv", show_col_types = FALSE) |>
  dplyr::glimpse(width = 100)
Rows: 68,208
Columns: 7
$ ResponseId   <chr> "R_4rOh5csuvsUlmsF", "R_9Hk3KD5bE28n9bn", "R_8plovBuEUJfYQRO", "R_5miAsDI8Pi7…
$ loc_country  <chr> "Armenia", "Armenia", "Armenia", "Armenia", "Armenia", "Armenia", "Armenia", …
$ loc_resident <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ loc_admin_1  <chr> "Yerevan", "Yerevan", "Yerevan", "Syunik", "Yerevan", "Yerevan", "Yerevan", "…
$ loc_admin_2  <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ lat          <dbl> 40.18720, 40.18720, 40.18720, 39.50899, 40.18720, 40.18720, 40.18720, 40.1872…
$ long         <dbl> 44.51521, 44.51521, 44.51521, 46.34389, 44.51521, 44.51521, 44.51521, 44.5152…
nrow(df_pub)
[1] 68208
df_pub <- df_pub |> 
  dplyr::left_join(location, by = "ResponseId") |>
  dplyr::relocate(
    loc_resident,
    loc_country,
    loc_admin_1,
    loc_admin_2,
    lat,
    long,
    .after = Q_Language
  )

# Sanity check: Number of rows should remain the same
nrow(df_pub)
[1] 68208
# Sanity check: How many missing location validations are in the dataset?
nrow(df_pub |> dplyr::filter(is.na(loc_resident)))
[1] 0
# Sanity check: How many missing latitudes are in the dataset?
nrow(df_pub |> dplyr::filter(is.na(lat) & !is.na(loc_country)))
[1] 0
# Sanity check: View the counts of location validation
df_pub |> dplyr::filter(loc_resident == 0) |> 
  dplyr::group_by(country, loc_resident) |> 
  dplyr::summarise(n = dplyr::n()) |>
  print_reactable(sorted_col = "country", width = 500)
# Sanity check: View the counts of administrative level units per country
df_pub |> dplyr::filter(loc_resident == 1) |>
  tidyr::pivot_longer(
    cols = c(loc_admin_1, loc_admin_2),
    names_to = "admin_level",
    values_to = "value"
  ) |>
  dplyr::summarise(
    unique_n = dplyr::n_distinct(value, na.rm = TRUE),
    .by = c(country, admin_level)
  ) |>
  print_reactable(sorted_col = "country", width = 500)
# Cleanup
rm(location)

Merge Sponsored Participants from Ireland’s Team

All participants from the Ireland’s sponsored dataset completed the survey and only the mandatory items were included. Some variables had different options than those in the main dataset.

# Merge the sponsored Irish participants
df_irl_raw <-
  readr::read_csv("999_irish_sponsored_public.csv", show_col_types = FALSE) |>
  dplyr::glimpse(width = 100)
Rows: 1,200
Columns: 21
$ utcdateandtime                <chr> "04/07/2025 09:30", "04/07/2025 09:35", "04/07/2025 09:36", …
$ participantprivateid          <dbl> 13767545, 13767552, 13767549, 13767554, 13767544, 13767547, …
$ branchpbkg                    <chr> "male", "female", "female", "female", "female", "female", "m…
$ qid12object4response          <chr> "Agree", "Strongly Agree", "Strongly Disagree", "Agree", "St…
$ qid13object6response          <chr> "Strongly Agree", "Agree", "Agree", "Disagree", "Agree", "Ag…
$ qid14object8response          <chr> "Strongly Agree", "Strongly Agree", "Strongly Disagree", "Ag…
$ qid15object9response          <chr> "Strongly Agree", "Absolutely Agree", "Agree", "Agree", "Agr…
$ qid16object10response         <chr> "Agree", "Strongly Agree", "Agree", "Disagree", "Agree", "Di…
$ qid17object11response         <chr> "Strongly Agree", "Absolutely Agree", "Agree", "Agree", "Agr…
$ qid18object12response         <chr> "Agree", "Absolutely Agree", "Strongly Agree", "Strongly Agr…
$ qid19object13response         <chr> "Agree", "Agree", "Agree", "Strongly Agree", "Strongly Agree…
$ qid20object14response         <chr> "Agree", "Strongly Agree", "Agree", "Agree", "Strongly Agree…
$ qid20object15response         <chr> "Agree", "Agree", "Neutral", "Disagree", "Agree", "Disagree"…
$ qid29object17response         <dbl> 8, 7, 7, 6, 8, 7, 7, 8, 6, 6, 2, 6, 7, 3, 6, 7, 8, 10, 9, 7,…
$ born_locationobject5response  <chr> "Ireland", "Ireland", "Ireland", "Ireland", "Ireland", "__ot…
$ educationobject8response      <chr> "Leaving Certificate", "Degree", "Master's", "Technical or V…
$ employmentobject9response     <chr> "Employed full-time", "Employed full-time", "Seeking Employm…
$ incomeobject12quantised       <dbl> 6, 6, 7, 3, 7, 2, 5, 6, 10, 3, 5, 1, 8, 4, 4, 5, 6, 7, 9, 5,…
$ incomeobject12response        <chr> "€67,001 - €85,000", "€67,001 - €85,000", "€85,001 - €105,00…
$ P1ageobject377Response        <dbl> 54, 24, 46, 23, 46, 49, 70, 37, 51, 34, 56, 31, 29, 48, 24, …
$ numhouseholdobject375Response <dbl> 1, 4, 2, 4, 3, 3, 3, 2, 4, 2, 5, 1, 1, 2, 1, 1, 2, 2, 4, 2, …
df_irl <- df_irl_raw |>
  dplyr::transmute(
    StartDate_irl = utcdateandtime,
    ResponseId = as.character(participantprivateid),
    sex_irl = branchpbkg,
    mpwb_competence = qid12object4response,
    mpwb_emotional_stability = qid13object6response,
    mpwb_engagement = qid14object8response,
    mpwb_meaning = qid15object9response,
    mpwb_optimism = qid16object10response,
    mpwb_positive_emotion = qid17object11response,
    mpwb_positive_relationships = qid18object12response,
    mpwb_resilience = qid19object13response,
    mpwb_self_esteem = qid20object14response,
    mpwb_vitality = qid20object15response,
    life_satisfaction = qid29object17response,
    ethnicity_citizenship_irl = born_locationobject5response,
    # The education options are slightly different from the version
    # used for Ireland non-sponsored participants
    education_irl = educationobject8response,
    # The employment options are slightly different from the version
    # used for Ireland non-sponsored participants
    employment_irl = employmentobject9response,
    # The income brackets are slightly different from the version
    # used for Ireland non-sponsored participants
    income_irl = incomeobject12quantised,
    household_size = numhouseholdobject375Response,
    age = P1ageobject377Response
  ) |>
  dplyr::mutate(

    sex_orig = dplyr::case_when(
      sex_irl == "male" ~ 1,
      sex_irl == "female" ~ 2,
      sex_irl == "other" ~ 3,
      TRUE ~ NA_integer_
    ),

    ethnicity_citizenship_orig = dplyr::case_when(
        # The only options given were "Ireland" and "__other"
        ethnicity_citizenship_irl == "Ireland" ~ "10",
        ethnicity_citizenship_irl == "__other" ~ "11",
        TRUE ~ NA_character_
      ),
    
    education_orig = dplyr::case_when(
      education_irl == "Less than Junior (Inter) Cert" ~ 1,
      education_irl == "Junior (Inter) Certificate or Equivalent" ~ 2,
      education_irl == "Leaving Certificate" ~ 3,
      education_irl == "Technical or Vocational Certificate" ~ 4,
      education_irl == "Diploma" ~ 5,
      education_irl == "Degree" ~ 6,
      education_irl == "Master's" ~ 7,
      education_irl == "Doctorate" ~ 8,
      TRUE ~ NA_integer_
    ),

    employment_orig = dplyr::case_when(
      employment_irl == "Employed full-time" ~ "3",
      employment_irl == "Employed part-time" ~ "4",
      employment_irl == "Student" ~ "1",
      employment_irl == "Seeking Employment/Unemployed" ~ "8",
      employment_irl == "Homemaker/Carer" ~ "7",
      employment_irl == "Unable to Work" ~ "9",
      employment_irl == "Retired" ~ "6",
      # The option below is not in the original coding scheme
      employment_irl == "Self-employed" ~ NA_character_,
      TRUE ~ NA_character_
    ),

    income_orig = dplyr::if_else(
      # The option 10 = "Prefer not to say" is recoded to NA
      income_irl == 10,
      NA_integer_,
      income_irl
    ),

    Q_Language = "EN-IRL-sponsored",
    UserLanguage = "EN-IRL-sponsored",
    iso3 = "IRL",
    iso2 = "IE",
    country = "Ireland",
    loc_resident = 1,
    loc_country = "Ireland",
    lat = 53.3861632,
    long = -10.5940283,
    irl = 1

  ) |>
  # We need to recode the MPWB items from text to numerical
  dplyr::mutate(
    dplyr::across(
      dplyr::all_of(mpwb_items),
      ~ as.numeric(base::factor(
        .,
        levels = c(
          # first level will be coded as 1
          "Absolutely Disagree",
          # second level will be coded as 2, etc.
          "Strongly Disagree",
          "Disagree",
          "Neutral",
          "Agree",
          "Strongly Agree",
          "Absolutely Agree"
          )
  )))) |>
  dplyr::glimpse(width = 100)
Rows: 1,200
Columns: 35
$ StartDate_irl               <chr> "04/07/2025 09:30", "04/07/2025 09:35", "04/07/2025 09:36", "0…
$ ResponseId                  <chr> "13767545", "13767552", "13767549", "13767554", "13767544", "1…
$ sex_irl                     <chr> "male", "female", "female", "female", "female", "female", "mal…
$ mpwb_competence             <dbl> 5, 6, 2, 5, 6, 5, 5, 5, 4, 5, 4, 4, 6, 7, 7, 5, 6, 6, 5, 7, 2,…
$ mpwb_emotional_stability    <dbl> 6, 5, 5, 3, 5, 5, 6, 5, 3, 5, 4, 5, 5, 6, 7, 5, 5, 7, 6, 5, 2,…
$ mpwb_engagement             <dbl> 6, 6, 2, 5, 5, 5, 6, 3, 5, 5, 5, 5, 5, 7, 7, 6, 5, 4, 5, 4, 2,…
$ mpwb_meaning                <dbl> 6, 7, 5, 5, 5, 4, 5, 5, 4, 5, 2, 5, 5, 7, 7, 5, 7, 7, 5, 7, 3,…
$ mpwb_optimism               <dbl> 5, 6, 5, 3, 5, 3, 6, 5, 4, 5, 3, 5, 5, 7, 7, 5, 7, 7, 6, 7, 3,…
$ mpwb_positive_emotion       <dbl> 6, 7, 5, 5, 5, 4, 6, 5, 4, 5, 3, 5, 5, 5, 6, 5, 6, 7, 7, 5, 3,…
$ mpwb_positive_relationships <dbl> 5, 7, 6, 6, 5, 5, 7, 5, 5, 6, 3, 6, 5, 4, 6, 6, 5, 6, 6, 5, 2,…
$ mpwb_resilience             <dbl> 5, 5, 5, 6, 6, 4, 5, 5, 5, 4, 5, 4, 5, 5, 5, 6, 6, 7, 6, 6, 3,…
$ mpwb_self_esteem            <dbl> 5, 6, 5, 5, 6, 4, 5, 5, 4, 5, 3, 5, 5, 1, 4, 5, 6, 7, 6, 5, 2,…
$ mpwb_vitality               <dbl> 5, 5, 4, 3, 5, 3, 5, 5, 4, 5, 3, 4, 4, 1, 4, 5, 6, 7, 4, 4, 2,…
$ life_satisfaction           <dbl> 8, 7, 7, 6, 8, 7, 7, 8, 6, 6, 2, 6, 7, 3, 6, 7, 8, 10, 9, 7, 2…
$ ethnicity_citizenship_irl   <chr> "Ireland", "Ireland", "Ireland", "Ireland", "Ireland", "__othe…
$ education_irl               <chr> "Leaving Certificate", "Degree", "Master's", "Technical or Voc…
$ employment_irl              <chr> "Employed full-time", "Employed full-time", "Seeking Employmen…
$ income_irl                  <dbl> 6, 6, 7, 3, 7, 2, 5, 6, 10, 3, 5, 1, 8, 4, 4, 5, 6, 7, 9, 5, 1…
$ household_size              <dbl> 1, 4, 2, 4, 3, 3, 3, 2, 4, 2, 5, 1, 1, 2, 1, 1, 2, 2, 4, 2, 3,…
$ age                         <dbl> 54, 24, 46, 23, 46, 49, 70, 37, 51, 34, 56, 31, 29, 48, 24, 34…
$ sex_orig                    <dbl> 1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 1, 2, 2,…
$ ethnicity_citizenship_orig  <chr> "10", "10", "10", "10", "10", "11", "10", "10", "10", "10", "1…
$ education_orig              <dbl> 3, 6, 7, 4, 6, 6, 7, 4, 5, 6, 7, 3, 7, 7, 7, 6, 3, 7, 6, 3, 6,…
$ employment_orig             <chr> "3", "3", "8", "3", "3", "7", "6", "3", "7", "3", "7", "4", "3…
$ income_orig                 <dbl> 6, 6, 7, 3, 7, 2, 5, 6, NA, 3, 5, 1, 8, 4, 4, 5, 6, 7, 9, 5, 1…
$ Q_Language                  <chr> "EN-IRL-sponsored", "EN-IRL-sponsored", "EN-IRL-sponsored", "E…
$ UserLanguage                <chr> "EN-IRL-sponsored", "EN-IRL-sponsored", "EN-IRL-sponsored", "E…
$ iso3                        <chr> "IRL", "IRL", "IRL", "IRL", "IRL", "IRL", "IRL", "IRL", "IRL",…
$ iso2                        <chr> "IE", "IE", "IE", "IE", "IE", "IE", "IE", "IE", "IE", "IE", "I…
$ country                     <chr> "Ireland", "Ireland", "Ireland", "Ireland", "Ireland", "Irelan…
$ loc_resident                <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ loc_country                 <chr> "Ireland", "Ireland", "Ireland", "Ireland", "Ireland", "Irelan…
$ lat                         <dbl> 53.38616, 53.38616, 53.38616, 53.38616, 53.38616, 53.38616, 53…
$ long                        <dbl> -10.59403, -10.59403, -10.59403, -10.59403, -10.59403, -10.594…
$ irl                         <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
# Sanity check: View the counts of each option
base::table(df_irl$mpwb_competence, df_irl_raw$qid12object4response, useNA = "always")
      
       Absolutely Agree Absolutely Disagree Agree Disagree Neutral Strongly Agree Strongly Disagree <NA>
  1                   0                  18     0        0       0              0                 0    0
  2                   0                   0     0        0       0              0                22    0
  3                   0                   0     0       59       0              0                 0    0
  4                   0                   0     0        0     262              0                 0    0
  5                   0                   0   498        0       0              0                 0    0
  6                   0                   0     0        0       0            229                 0    0
  7                 112                   0     0        0       0              0                 0    0
  <NA>                0                   0     0        0       0              0                 0    0
base::table(df_irl$mpwb_emotional_stability, df_irl_raw$qid13object6response, useNA = "always")
      
       Absolutely Agree Absolutely Disagree Agree Disagree Neutral Strongly Agree Strongly Disagree <NA>
  1                   0                  20     0        0       0              0                 0    0
  2                   0                   0     0        0       0              0                39    0
  3                   0                   0     0      148       0              0                 0    0
  4                   0                   0     0        0     231              0                 0    0
  5                   0                   0   490        0       0              0                 0    0
  6                   0                   0     0        0       0            192                 0    0
  7                  80                   0     0        0       0              0                 0    0
  <NA>                0                   0     0        0       0              0                 0    0
base::table(df_irl$mpwb_engagement, df_irl_raw$qid14object8response, useNA = "always")
      
       Absolutely Agree Absolutely Disagree Agree Disagree Neutral Strongly Agree Strongly Disagree <NA>
  1                   0                  11     0        0       0              0                 0    0
  2                   0                   0     0        0       0              0                18    0
  3                   0                   0     0      111       0              0                 0    0
  4                   0                   0     0        0     362              0                 0    0
  5                   0                   0   480        0       0              0                 0    0
  6                   0                   0     0        0       0            157                 0    0
  7                  61                   0     0        0       0              0                 0    0
  <NA>                0                   0     0        0       0              0                 0    0
base::table(df_irl$mpwb_meaning, df_irl_raw$qid15object9response, useNA = "always")
      
       Absolutely Agree Absolutely Disagree Agree Disagree Neutral Strongly Agree Strongly Disagree <NA>
  1                   0                  24     0        0       0              0                 0    0
  2                   0                   0     0        0       0              0                26    0
  3                   0                   0     0       82       0              0                 0    0
  4                   0                   0     0        0     264              0                 0    0
  5                   0                   0   479        0       0              0                 0    0
  6                   0                   0     0        0       0            213                 0    0
  7                 112                   0     0        0       0              0                 0    0
  <NA>                0                   0     0        0       0              0                 0    0
base::table(df_irl$mpwb_optimism, df_irl_raw$qid17object11response, useNA = "always")
      
       Absolutely Agree Absolutely Disagree Agree Disagree Neutral Strongly Agree Strongly Disagree <NA>
  1                   0                  21     1        8       3              2                 4    0
  2                   0                   2     0       15      13              1                 8    0
  3                   0                   0    21       36      45              2                 4    0
  4                   4                   2   111       20     144             17                 2    0
  5                  15                   0   302        7      47             51                 1    0
  6                  26                   0    52        0       9             95                 0    0
  7                  66                   0    20        0       2             20                 1    0
  <NA>                0                   0     0        0       0              0                 0    0
base::table(df_irl$mpwb_positive_emotion, df_irl_raw$qid15object9response, useNA = "always")
      
       Absolutely Agree Absolutely Disagree Agree Disagree Neutral Strongly Agree Strongly Disagree <NA>
  1                   1                  15     1        2       0              1                 5    0
  2                   0                   0     5        2       5              2                 6    0
  3                   0                   7    17       22      30              3                 7    0
  4                   4                   1    85       40     112             14                 7    0
  5                  20                   1   293       14      99             79                 1    0
  6                  25                   0    64        2      15             82                 0    0
  7                  62                   0    14        0       3             32                 0    0
  <NA>                0                   0     0        0       0              0                 0    0
base::table(df_irl$mpwb_positive_relationships, df_irl_raw$qid18object12response, useNA = "always")
      
       Absolutely Agree Absolutely Disagree Agree Disagree Neutral Strongly Agree Strongly Disagree <NA>
  1                   0                  26     0        0       0              0                 0    0
  2                   0                   0     0        0       0              0                22    0
  3                   0                   0     0       84       0              0                 0    0
  4                   0                   0     0        0     179              0                 0    0
  5                   0                   0   498        0       0              0                 0    0
  6                   0                   0     0        0       0            235                 0    0
  7                 156                   0     0        0       0              0                 0    0
  <NA>                0                   0     0        0       0              0                 0    0
base::table(df_irl$mpwb_resilience, df_irl_raw$qid19object13response, useNA = "always")
      
       Absolutely Agree Absolutely Disagree Agree Disagree Neutral Strongly Agree Strongly Disagree <NA>
  1                   0                  23     0        0       0              0                 0    0
  2                   0                   0     0        0       0              0                34    0
  3                   0                   0     0      145       0              0                 0    0
  4                   0                   0     0        0     248              0                 0    0
  5                   0                   0   504        0       0              0                 0    0
  6                   0                   0     0        0       0            159                 0    0
  7                  87                   0     0        0       0              0                 0    0
  <NA>                0                   0     0        0       0              0                 0    0
base::table(df_irl$mpwb_self_esteem, df_irl_raw$qid20object14response, useNA = "always")
      
       Absolutely Agree Absolutely Disagree Agree Disagree Neutral Strongly Agree Strongly Disagree <NA>
  1                   0                  39     0        0       0              0                 0    0
  2                   0                   0     0        0       0              0                34    0
  3                   0                   0     0      110       0              0                 0    0
  4                   0                   0     0        0     264              0                 0    0
  5                   0                   0   460        0       0              0                 0    0
  6                   0                   0     0        0       0            176                 0    0
  7                 117                   0     0        0       0              0                 0    0
  <NA>                0                   0     0        0       0              0                 0    0
base::table(df_irl$mpwb_vitality, df_irl_raw$qid20object15response, useNA = "always")
      
       Absolutely Agree Absolutely Disagree Agree Disagree Neutral Strongly Agree Strongly Disagree <NA>
  1                   0                  55     0        0       0              0                 0    0
  2                   0                   0     0        0       0              0                87    0
  3                   0                   0     0      247       0              0                 0    0
  4                   0                   0     0        0     325              0                 0    0
  5                   0                   0   330        0       0              0                 0    0
  6                   0                   0     0        0       0            108                 0    0
  7                  48                   0     0        0       0              0                 0    0
  <NA>                0                   0     0        0       0              0                 0    0
# For the rows that are not in the Irish dataset
df_pub$irl <- 0

# Merge both datasets
df_merged <- dplyr::bind_rows(df_pub, df_irl) |>
  dplyr::relocate(StartDate_irl, .after = StartDate) |>
  dplyr::relocate(Q_Language, .after = ResponseId) |>
  dplyr::relocate(age, .after = birth_year_orig) |>
  dplyr::relocate(sex_irl, .after = sex_orig) |>
  dplyr::relocate(ethnicity_citizenship_irl, .after = ethnicity_citizenship_orig) |>
  dplyr::relocate(employment_irl, .after = employment_orig) |>
  dplyr::relocate(education_irl, .after = education_orig) |>
  dplyr::relocate(income_irl, .after = income_orig)

# Total sample size before individual exclusion criteria
nrow(df_merged)
[1] 69408
# Sanity check:
# Is the sum of rows of both individual datasets equal to the merged dataset?
(length(df_irl$ResponseId) + length(df_pub$ResponseId)) ==
  length(df_merged$ResponseId)
[1] TRUE
# Extract the labels from df_pub and place them back to df_merged
for (i in intersect(names(df_pub), names(df_merged))) {
    attr(df_merged[[i]], "label") <- attr(df_pub[[i]], "label")
}

# Cleanup
rm(df_irl_raw)

MPWB

# Sanity check: View the counts of each option
for (i in mpwb_items) {
  eval(parse(text = sprintf("table_label(df_pub$%s)", i)))
  cat("\n")
}
$mpwb_positive_relationships
I receive help and support from people I am close to when I need it.
    1     2     3     4     5     6     7  <NA> 
 1751  1955  4594  7323 21636 14217 11715  5017 
Class: numeric 

$mpwb_meaning
I feel what I do in my life is valuable and worthwhile.
    1     2     3     4     5     6     7  <NA> 
 1966  2448  5393  8627 21037 13474 10271  4992 
Class: numeric 

$mpwb_competence
I feel a sense of accomplishment from what I do.
    1     2     3     4     5     6     7  <NA> 
 1907  2582  6319  9177 22141 13119  7974  4989 
Class: numeric 

$mpwb_engagement
I feel absorbed in what I am doing.
    1     2     3     4     5     6     7  <NA> 
 1229  2041  6701 11039 22993 12211  7040  4954 
Class: numeric 

$mpwb_self_esteem
I feel positive about myself.
    1     2     3     4     5     6     7  <NA> 
 2009  2703  6688  9186 21232 12895  8582  4913 
Class: numeric 

$mpwb_optimism
I am optimistic about my future.
    1     2     3     4     5     6     7  <NA> 
 2770  3118  6488 10098 19560 11972  9255  4947 
Class: numeric 

$mpwb_positive_emotion
I feel happy.
    1     2     3     4     5     6     7  <NA> 
 2114  2673  6253 12053 21446 11308  7423  4938 
Class: numeric 

$mpwb_emotional_stability
I feel calm and peaceful.
    1     2     3     4     5     6     7  <NA> 
 2571  3882 10398 11446 19752  9404  5835  4920 
Class: numeric 

$mpwb_resilience
I recover quickly from things that go wrong in my life.
    1     2     3     4     5     6     7  <NA> 
 2385  3916 10520 10552 21264  9366  5205  5000 
Class: numeric 

$mpwb_vitality
I feel full of energy.
    1     2     3     4     5     6     7  <NA> 
 3422  5107 11610 12272 17740  8110  5015  4932 
Class: numeric 
df_merged <- df_merged |>
  dplyr::rowwise() |>
  dplyr::mutate(
    # Identify participants that completed all MPWB items
    mpwb_n = base::sum(!is.na(dplyr::c_across(dplyr::all_of(mpwb_items)))),
    
    # Calculate variance, average and sum score of the MPWB items
    # explicitly to only for participants who answered all MPWB items
    mpwb_mean = dplyr::if_else(
      mpwb_n == 10,
      base::mean(dplyr::c_across(dplyr::all_of(mpwb_items))),
      NA_real_
    ),

    mpwb_var = dplyr::if_else(
      mpwb_n == 10,
      stats::var(dplyr::c_across(dplyr::all_of(mpwb_items))),
      NA_real_
    ),

    mpwb_sum = dplyr::if_else(
      mpwb_n == 10,
      base::sum(dplyr::c_across(dplyr::all_of(mpwb_items))),
      NA_real_
    )
  ) |>
  
  # remove the rowwise computation
  dplyr::ungroup() |>
  
  # organise the variables positions
  dplyr::relocate(mpwb_n:mpwb_sum, .after = mpwb_vitality)
  
# Sanity check: View the new MPWB variables
dplyr::glimpse(df_merged |> dplyr::select(dplyr::starts_with("mpwb_")), width = 100)
Rows: 69,408
Columns: 14
$ mpwb_competence             <dbl> 6, 5, 5, 5, 5, 5, 5, 7, 5, 5, 4, 7, 5, 5, 5, 4, 6, 6, 5, 5, 7,…
$ mpwb_emotional_stability    <dbl> 6, 3, 5, 5, 5, 4, 5, 7, 6, 7, 5, 5, 4, 7, 5, 4, 7, 6, 4, 5, 5,…
$ mpwb_engagement             <dbl> 6, 6, 5, 6, 5, 4, 5, 7, 3, 5, 5, 6, 7, 7, 5, 4, 4, 5, 5, 3, 7,…
$ mpwb_meaning                <dbl> 6, 3, 5, 6, 4, 4, 6, 7, 5, 6, 4, 5, 4, 6, 5, 4, 7, 5, 4, 5, 7,…
$ mpwb_optimism               <dbl> 7, 5, 5, 6, 5, 3, 7, 7, 6, 7, 4, 6, 5, 6, 5, 4, 7, 5, 6, 6, 7,…
$ mpwb_positive_emotion       <dbl> 5, 3, 5, 6, 5, 7, 6, 7, 7, 5, 4, 5, 6, 6, 4, 4, 7, 5, 5, 5, 7,…
$ mpwb_positive_relationships <dbl> 5, 5, 5, 4, 7, 7, 6, 7, 5, 7, 5, 6, 5, 6, 5, 4, 6, 5, 4, 3, 7,…
$ mpwb_resilience             <dbl> 5, 5, 5, 6, 5, 3, 6, 7, 5, 7, 4, 6, 4, 6, 4, 4, 7, 7, 4, 3, 6,…
$ mpwb_self_esteem            <dbl> 6, 5, 5, 7, 3, 4, 7, 7, 6, 5, 4, 7, 5, 6, 5, 4, 6, 6, 6, 6, 7,…
$ mpwb_vitality               <dbl> 5, 1, 4, 5, 3, 4, 5, 7, 6, 7, 4, 6, 4, 6, 5, 4, 5, 5, 5, 5, 5,…
$ mpwb_n                      <int> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10…
$ mpwb_mean                   <dbl> 5.7, 4.1, 4.9, 5.6, 4.7, 4.5, 5.8, 7.0, 5.4, 6.1, 4.3, 5.9, 4.…
$ mpwb_var                    <dbl> 0.4555556, 2.3222222, 0.1000000, 0.7111111, 1.3444444, 2.05555…
$ mpwb_sum                    <dbl> 57, 41, 49, 56, 47, 45, 58, 70, 54, 61, 43, 59, 49, 61, 48, 40…
# Sanity check: Are there missing values in the sum score when mpwb_n is 10?
base::table(df_merged$mpwb_n, is.na(df_merged$mpwb_sum), useNA = "always")
      
       FALSE  TRUE  <NA>
  0        0  3247     0
  1        0   722     0
  2        0   396     0
  3        0   372     0
  4        0   275     0
  5        0   239     0
  6        0   240     0
  7        0   193     0
  8        0   152     0
  9        0   174     0
  10   63398     0     0
  <NA>     0     0     0
# Sanity check: Are there values in the var score when mpwb_n is not 10?
df_merged |> dplyr::filter(mpwb_n != 10 & (!is.na(mpwb_var))) |> base::nrow()
[1] 0
# Cleanup
rm(i)

Completion time

df_merged <- df_merged |>
  dplyr::rowwise() |>
  dplyr::mutate(

    # Count how many items were answered (not NA) after the debts item
    # (all items up to the debts item were forced-response)
    n_items_after = base::sum(!is.na(dplyr::c_across(
      c(
        followup,
        phq_interest,
        phq_down,
        gad_anxious,
        gad_worry,
        childhood_SES,
        fin_outlook,
        fin_outlook_conf,
        attention_care,
        work_arrangement
      )
    ))),

    # Calculate adjusted duration if the mandatory items were completed.
    # Some survey versions have different variables of the same item,
    # but all versions have 20 mandatory items before debts.
    total_items = dplyr::if_else(
      !is.na(debts_orig),
      20 + n_items_after,
      NA_real_),

    duration_adj = dplyr::if_else(
      !is.na(debts_orig),
      duration_sec / total_items,
      NA_real_)

  ) |>
  dplyr::ungroup() |>

  # organise the variables positions
  dplyr::relocate(n_items_after:duration_adj, .after = duration_sec)

# Sanity check: View the new variables
dplyr::glimpse(df_merged |> dplyr::select(duration_sec:duration_adj), width = 100)
Rows: 69,408
Columns: 4
$ duration_sec  <dbl> 1028, 442, 370, 426, 512, 344, 341, 744, 582, 1006, 233, 173, 6735, 120, 270…
$ n_items_after <int> 1, 10, 1, 10, 9, 10, 10, 1, 10, 9, 1, 10, 1, 10, 10, 10, 1, 10, 9, 1, 9, 10,…
$ total_items   <dbl> 21, 30, 21, 30, 29, 30, 30, 21, 30, 29, 21, 30, 21, 30, 30, 30, 21, 30, 29, …
$ duration_adj  <dbl> 48.952381, 14.733333, 17.619048, 14.200000, 17.655172, 11.466667, 11.366667,…
# Sanity check: Is there a mismatch between n_items_after and total_items?
base::table(df_merged$n_items_after, df_merged$total_items, useNA = "always")
      
          20    21    24    25    26    27    28    29    30  <NA>
  0     1899     0     0     0     0     0     0     0     0 13381
  1        0 14144     0     0     0     0     0     0     0     0
  4        0     0     6     0     0     0     0     0     0     0
  5        0     0     0    44     0     0     0     0     0     0
  6        0     0     0     0    80     0     0     0     0     0
  7        0     0     0     0     0     1     0     0     0     0
  8        0     0     0     0     0     0  1313     0     0     0
  9        0     0     0     0     0     0     0  9303     0     0
  10       0     0     0     0     0     0     0     0 29237     0
  <NA>     0     0     0     0     0     0     0     0     0     0
# Sanity check: Is there unexpected missing values in total_items?
df_merged |>
  dplyr::summarise(
    all_total_items_missing_when_debts_missing =
      all(is.na(total_items[is.na(debts_orig)])),
    any_total_items_present_when_debts_missing =
      any(!is.na(total_items[is.na(debts_orig)])))
# A tibble: 1 × 2
  all_total_items_missing_when_debts_missing any_total_items_present_when_debts_missing
  <lgl>                                      <lgl>                                     
1 TRUE                                       FALSE                                     
# Sanity check: Is there unexpected missing values in n_items_after?
base::table(df_merged$n_items_after, is.na(df_merged$debts_orig), useNA = "always")
      
       FALSE  TRUE  <NA>
  0     1899 13381     0
  1    14144     0     0
  4        6     0     0
  5       44     0     0
  6       80     0     0
  7        1     0     0
  8     1313     0     0
  9     9303     0     0
  10   29237     0     0
  <NA>     0     0     0
# Sanity check: View the range of duration_adj
df_merged |>
  dplyr::filter(!is.na(debts_orig)) |>
  dplyr::summarise(
    min_duration_adj = min(duration_adj, na.rm = TRUE),
    max_duration_adj = max(duration_adj, na.rm = TRUE)
  )
# A tibble: 1 × 2
  min_duration_adj max_duration_adj
             <dbl>            <dbl>
1              1.7           22309.
# Filter the duration adjusted for plotting
df_sub <- df_merged |>
  filter(duration_adj >= 0, duration_adj <= 25)
# Plot intra-individual variance vs time, faceted by country
ggplot2::ggplot(df_sub, ggplot2::aes(x = duration_adj, y = mpwb_var)) +
  ggplot2::geom_point(alpha = 0.25, size = 0.8) +
  ggplot2::geom_smooth(formula = y ~ x, method = "loess", se = TRUE) +
  ggplot2::facet_wrap(~ country, scales = "free_y", ncol = 4, nrow = 25) +
  ggplot2::labs(
    x = "Duration adjusted (seconds)",
    y = "Within-person variance across MPWB"
  ) +
  ggplot2::theme(
    strip.text = ggplot2::element_text(size = 9, face = "bold"),
    axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)
  )

ggplot2::ggplot(df_sub, ggplot2::aes(x = duration_adj, y = mpwb_sum)) +
  ggplot2::geom_point(alpha = 0.2, size = 0.8) +
  ggplot2::geom_smooth(formula = y ~ x, method = "loess", se = TRUE) +
  ggplot2::facet_wrap(~ country, scales = "free", ncol = 4, nrow = 23) +
  ggplot2::labs(
    x = "Duration adjusted (seconds)",
    y = "MPWB Sum"
  ) +
  ggplot2::theme(
    strip.text = ggplot2::element_text(size = 9, face = "bold"),
    axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)
  )

# Cleanup
rm(df_sub)

PHQ4

The original PHQ-4 has vague verbal anchors that could limit the comparability of results across languages. For example, the option “Several days” could be interpreted as 2-3 days or as more than 7 days in other languages. Since “One week” is not more than 7 days, we decided to recode it as “Several days”.

Used anchors Original anchors Recoded value
Never (1) Not at all (0) 0
Once or twice (1–2) (2) Several days (1) 1
A few days (3–4) (3) Several days (1) 1
Several days (4) Several days (1) 1
One week (5) Several days (1) 1
More than a week (6) More than half the days (2) 2
Every day / nearly every day(7) Nearly every day (3) 3
# Sanity check: View the counts of each option
for (i in phq4_items) {
  eval(parse(text = sprintf("table_label(df_merged$%s)", i)))
  cat("\n")
}
$phq_interest
Over the last 2 weeks, how often have you been bothered by the following problems? - Little interest or pleasure in doing things
    1     2     3     4     5     6     7  <NA> 
 6413 13900  8172  3943  1230  1948  4378 29424 
Class: numeric 

$phq_down
Over the last 2 weeks, how often have you been bothered by the following problems? - Feeling down, depressed or hopeless
    1     2     3     4     5     6     7  <NA> 
 8727 14216  6675  3362  1070  2068  3866 29424 
Class: numeric 

$gad_anxious
Over the last 2 weeks, how often have you been bothered by the following problems? - Feeling nervous, anxious or on edge
    1     2     3     4     5     6     7  <NA> 
 5755 13322  7569  4542  1275  2303  5218 29424 
Class: numeric 

$gad_worry
Over the last 2 weeks, how often have you been bothered by the following problems? - Not being able to stop or control worrying
    1     2     3     4     5     6     7  <NA> 
11900 11965  5229  3206  1240  2142  4302 29424 
Class: numeric 
# Function to recode PHQ-4 items.
recode_phq <- function(i) {
  dplyr::case_when(
    i == 1 ~ 0,
    i %in% 2:5 ~ 1,
    i == 6 ~ 2,
    i == 7 ~ 3,
    TRUE ~ NA_real_
  )
}

# Sanity check: Count missing values in PHQ-4 items when gad_worry is not missing
df_merged |>
  dplyr::filter(!is.na(gad_worry)) |>
  dplyr::summarise(
    dplyr::across(dplyr::all_of(phq4_items), ~ base::sum(is.na(.x))),
    n_total = dplyr::n()
  )
# A tibble: 1 × 5
  phq_interest phq_down gad_anxious gad_worry n_total
         <int>    <int>       <int>     <int>   <int>
1            0        0           0         0   39984
# Apply recoding and compute sum scores
# only for participants who answered all PHQ-4 items
# (i.e., not missing in the last PHQ item)
# gad_worry was the last item in the PHQ-4 matrix
df_merged <- df_merged |>
  dplyr::mutate(
    # Calculate the sums for phq2, gad2, and phq4
    # only for participants who answered all PHQ-4 items
    phq2_sum = dplyr::if_else(
      !is.na(gad_worry),
      phq_down + phq_interest,
      NA_real_
    ),

    gad2_sum = dplyr::if_else(
      !is.na(gad_worry),
      gad_worry + gad_anxious,
      NA_real_
    ),

    phq4_sum = phq2_sum + gad2_sum
  ) |>

  dplyr::mutate(
    # Apply the recoding function to the individual PHQ items
    dplyr::across(all_of(phq4_items), recode_phq, .names = "{.col}_rec"),

    # Calculate the sums for recoded phq2, gad2, and phq4
    # only for participants who answered all PHQ-4 items
    phq2_sum_rec = dplyr::if_else(
      !is.na(gad_worry),
      phq_down_rec + phq_interest_rec,
      NA_real_
    ),

    gad2_sum_rec = dplyr::if_else(
      !is.na(gad_worry),
      gad_worry_rec + gad_anxious_rec,
      NA_real_
    ),

    phq4_sum_rec = phq2_sum_rec + gad2_sum_rec,

    # Create a variable with cut-off labels
    phq4_cat = dplyr::case_when(
      !is.na(phq4_sum_rec) & phq4_sum_rec >= 0 & phq4_sum_rec <= 2 ~ "Normal (0–2)",
      !is.na(phq4_sum_rec) & phq4_sum_rec >= 3 & phq4_sum_rec <= 5 ~ "Mild (3–5)",
      !is.na(phq4_sum_rec) & phq4_sum_rec >= 6 & phq4_sum_rec <= 8 ~ "Moderate (6–8)",
      !is.na(phq4_sum_rec) & phq4_sum_rec >= 9 & phq4_sum_rec <= 12 ~ "Severe (9–12)",
      # I expect character values, so NA_character_
      TRUE ~ NA_character_
    ),

    # Create variables for depression and anxiety screening,
    # using the standard cut-off of 3 on the respective subscales
    depression_screen = dplyr::case_when(
      is.na(phq2_sum_rec) ~ NA_real_,
      phq2_sum_rec >= 3 ~ 1,
      TRUE ~ 0
    ),
    
    anxiety_screen = dplyr::case_when(
      is.na(gad2_sum_rec) ~ NA_real_,
      gad2_sum_rec >= 3 ~ 1,
      TRUE ~ 0
    )
  ) |>
  dplyr::relocate(phq2_sum:anxiety_screen, .after = gad_worry)

# Sanity checks (view the new variables)
dplyr::glimpse(
  df_merged |>
    dplyr::filter(!is.na(gad_worry)) |>
    dplyr::select(phq_interest:anxiety_screen),
  width = 100
)
Rows: 39,984
Columns: 17
$ phq_interest      <dbl> 2, 2, 3, 6, 1, 2, 1, 2, 1, 2, 2, 1, 2, 2, 7, 6, 2, 1, 1, 2, 1, 2, 1, 3, …
$ phq_down          <dbl> 3, 1, 3, 4, 1, 1, 1, 2, 2, 2, 2, 1, 2, 1, 3, 6, 2, 1, 1, 2, 1, 2, 2, 1, …
$ gad_anxious       <dbl> 2, 2, 3, 7, 2, 2, 1, 3, 2, 3, 2, 1, 1, 2, 3, 7, 2, 1, 1, 2, 1, 1, 1, 2, …
$ gad_worry         <dbl> 1, 2, 3, 7, 1, 1, 1, 3, 2, 3, 2, 1, 2, 1, 1, 3, 2, 1, 1, 2, 1, 1, 2, 1, …
$ phq2_sum          <dbl> 5, 3, 6, 10, 2, 3, 2, 4, 3, 4, 4, 2, 4, 3, 10, 12, 4, 2, 2, 4, 2, 4, 3, …
$ gad2_sum          <dbl> 3, 4, 6, 14, 3, 3, 2, 6, 4, 6, 4, 2, 3, 3, 4, 10, 4, 2, 2, 4, 2, 2, 3, 3…
$ phq4_sum          <dbl> 8, 7, 12, 24, 5, 6, 4, 10, 7, 10, 8, 4, 7, 6, 14, 22, 8, 4, 4, 8, 4, 6, …
$ phq_interest_rec  <dbl> 1, 1, 1, 2, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 3, 2, 1, 0, 0, 1, 0, 1, 0, 1, …
$ phq_down_rec      <dbl> 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 2, 1, 0, 0, 1, 0, 1, 1, 0, …
$ gad_anxious_rec   <dbl> 1, 1, 1, 3, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 3, 1, 0, 0, 1, 0, 0, 0, 1, …
$ gad_worry_rec     <dbl> 0, 1, 1, 3, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, …
$ phq2_sum_rec      <dbl> 2, 1, 2, 3, 0, 1, 0, 2, 1, 2, 2, 0, 2, 1, 4, 4, 2, 0, 0, 2, 0, 2, 1, 1, …
$ gad2_sum_rec      <dbl> 1, 2, 2, 6, 1, 1, 0, 2, 2, 2, 2, 0, 1, 1, 1, 4, 2, 0, 0, 2, 0, 0, 1, 1, …
$ phq4_sum_rec      <dbl> 3, 3, 4, 9, 1, 2, 0, 4, 3, 4, 4, 0, 3, 2, 5, 8, 4, 0, 0, 4, 0, 2, 2, 2, …
$ phq4_cat          <chr> "Mild (3–5)", "Mild (3–5)", "Mild (3–5)", "Severe (9–12)", "Normal (0–2)…
$ depression_screen <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
$ anxiety_screen    <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
# Sanity check: View the range of the recoded variables
base::table(df_merged$phq2_sum_rec, useNA = "always")

    0     1     2     3     4     5     6  <NA> 
 4044  6610 21207  1987  2789   891  2456 29424 
base::table(df_merged$gad2_sum_rec, useNA = "always")

    0     1     2     3     4     5     6  <NA> 
 4500  8337 18360  1861  2634  1130  3162 29424 
base::table(df_merged$phq4_sum_rec, useNA = "always")

    0     1     2     3     4     5     6     7     8     9    10    11    12  <NA> 
 1958  2617  4413  6396 13785  1936  2304  1135  1600   794  1046   496  1504 29424 
# Sanity check: Is there mismatch missing values between the two screenings?
dplyr::count(df_merged, depression_screen, anxiety_screen, name = "n")
# A tibble: 5 × 3
  depression_screen anxiety_screen     n
              <dbl>          <dbl> <int>
1                 0              0 28651
2                 0              1  3210
3                 1              0  2546
4                 1              1  5577
5                NA             NA 29424
# Cleanup
rm(recode_phq, i)

Life Satisfaction

# Item is already numeric and within the scale.
table_label(df_merged$life_satisfaction)
$life_satisfaction
Overall, how satisfied are you with life as a whole these days?
    0     1     2     3     4     5     6     7     8     9    10  <NA> 
 1825   945  1844  3208  4266  6865  9228 12343 11174  5508  6005  6197 
Class: numeric 

Income, Assets, and Debts

The open text field contained a Qualtrics validation that forced participants to answer only with digits [0-9], commas, and periods. However, a small amount of participants managed to enter values beyond this validation (e.g., including percentage signs, letters, or other characters).

For the countries where digits 0-9 are not the default numeric keypad, the translations included instructions requesting that participants use only digits 0-9 (Algeria, Bahrain, Chad, Egypt, Kuwait, Morocco, Oman, Saudi Arabia, UAE, Lebanon, Qatar).

Clean numbers
# Sanity check: View the counts of each option
# Option 10 is "Specify: [open text field]"
table_label(df_merged$income_orig)
$income_orig
Please indicate what your total household income was for 2024 (before taxes). You can select an option or indicate a precise value. If you are retired or live off a pension, please indicate the amount your household received during the year in total payments. - Selected Choice
   0    1    2    3    4    5    6    7    8    9   10 <NA> 
2450 5532 6495 6707 6593 5832 5160 5042 3933 5890 6974 8800 
Class: numeric 
# Sanity check: View the variable types
class(df_merged$income_text_orig)
[1] "character"
class(df_merged$assets_orig)
[1] "character"
class(df_merged$debts_orig)
[1] "character"
# Participants were able to write in a open text field their income, assets, and debts.
head(unique(df_merged$income_text_orig), 20)
 [1] NA         "8000"     "15000"    "7000"     "243000"   "124000"   "12345678" "150000"   "700.000"  "10000"    "50000"    "400"      "2500"    
[14] "1500"     "6000"     "300000"   "3600"     "636000"   "643"      "435"     
head(unique(df_merged$assets_orig), 20)
 [1] "5"         "2"         "20.000"    "1000000"   "5000"      "0,00"      "250000"    "1,000.00"  "50000"     "00"        "0"         "700000"   
[13] "20000000"  "600000"    "70000000"  "1"         "100,000"   "50000000"  "7000000"   "350000000"
head(unique(df_merged$debts_orig), 20)
 [1] "10000000"    "2"           "18000"       "0"           "125000"      "0,00"        "1,000.00"    "20.000,00"   "1500"        "200000000"  
[11] "90000"       "150000"      "200000"      "1300000"     "10,000"      "10000"       "120000"      "66000000"    "100,000,000" "200"        
# View values that end with "," or "."
df_merged |>
  dplyr::filter(grepl("[.,]$", income_text_orig)) |>
  dplyr::select(ResponseId, income_text_orig) |>
  base::nrow();
[1] 0
df_merged |>
  dplyr::filter(grepl("[.,]$", assets_orig)) |>
  dplyr::select(ResponseId, assets_orig);
# A tibble: 18 × 2
   ResponseId        assets_orig 
   <chr>             <chr>       
 1 R_2ilYHj1poprgCX8 1,00,000,   
 2 R_2Iaw1PAzIm22N4f 1,500,000.  
 3 R_7Xai7kgm6ni70up 1000000.    
 4 R_3Ezenl8l5Vbehqq 650000.     
 5 R_8EouiGcGN3SO3RO 200000.     
 6 R_7QEIOPC6sqjQ7jF 300000,     
 7 R_8NwCae5exBdtR98 600,000.    
 8 R_3wBLehUhjYbWTbq 100000.     
 9 R_9d3Tm1Wu2M6gFoh 0,          
10 R_2E6m2ErMvEOb0wx 4000.       
11 R_6elc9peo8vxbMVH 3000.       
12 R_7rYZllIWkzJDJ2X 10.         
13 R_9GHpnvrI5tXbopH 350000.     
14 R_7sbQ258PDYZf45A 2,000000.   
15 R_1Hc7FpY3tW9nsh7 500,000.    
16 R_9dhgf8xvk6Ib8LX 100000.     
17 R_8eOIl90Z2J6iB6k 5,000,000.  
18 R_8CSIx79Mqkq1qaB 600,000,000,
df_merged |>
  dplyr::filter(grepl("[.,]$", debts_orig)) |>
  dplyr::select(ResponseId, debts_orig)
# A tibble: 6 × 2
  ResponseId        debts_orig
  <chr>             <chr>     
1 R_7QEIOPC6sqjQ7jF 23000,    
2 R_1wuhfjwEOnWp9AS 0.        
3 R_16SQZLnjugK3f6p 0.        
4 R_5bW2dvfC8MaUgLB 5,00.     
5 R_8CB0K2YQUfWxGY1 0.        
6 R_5xPMFVkMda7RhuS 18000.    
# Create function to clean numbers
clean_number <- function(i) {
  parse_one <- function(s) {
    # Keep NA as NA
    if (is.na(s))
      return(NA_real_)

    # Remove leading/trailing spaces
    s <- stringr::str_trim(s)

    # first character must be a digit, otherwise NA
    if (!stringr::str_detect(s, "^[0-9]"))
      return(NA_real_)

    # If contains "%" or "x", set to NA
    if (stringr::str_detect(s, "%") || stringr::str_detect(s, "[xX]"))
      return(NA_real_)

    # Handle scientific notation ( if e/E is present)
    if (stringr::str_detect(s, "[eE]")) {
      s_sci <- s |>
        stringr::str_replace_all(",", ".") |>
        stringr::str_replace_all("[^0-9eE+\\-\\.]", "")
      val <- as.numeric(s_sci)
      return(val)

    }

    # Remove non-numeric characters (except "." and ",")
    s <- stringr::str_remove_all(s, "[^0-9,\\.]")

    # Allow "0"
    if (s == "0")
      return(0)

    # Place values of 0.0 / 0.00 / 0,0 / 0,00 / 0,000 as 0
    if (stringr::str_detect(s, "^0[\\.,]0{1,3}$"))
      return(0)

    # Otherwise, anything else starting with 0 and longer than 1 char -> NA
    # For example: "007", "01", "0.7", "0,7", "0.000", "0,000", "0002"
    if (stringr::str_detect(s, "^0") && base::nchar(s) > 1)
      return(NA_real_)

    # Remove "." or "," at the very end
    # For example: "1.000.000." -> "1.000.000"
    s <- stringr::str_replace(s,"[,\\.]$","")

    # Identify last occurrence of "," or "." as decimal separator
    # Some countries use "," as decimal separator and others use "."
    m <- stringr::str_match(s, "([,\\.])([0-9]*)$")

    if (!is.na(m[1])) {
      # Count the number of digits after the last separator
      sep <- m[2]
      digits_after <- m[3]
      len <- base::nchar(digits_after)

      if (len >= 3) {

        # Thousands separator, remove all separators
        # For example: "1.000.000" -> "1000000"
        s <- stringr::str_remove_all(s, "[,.]")

      } else {

        # Decimal, keep only last separator as decimal
        # Remove all other separators
        # For example: "1.000.000.00" -> "1000000.00"
        # "1,000,000,00" -> "1000000,00"
        s_wo_last <- stringr::str_sub(s, 1, nchar(s) - len - 1)
        s_wo_last <- stringr::str_remove_all(s_wo_last, "[,.]")

        # This R session uses "." as decimal separator,
        # so we need to convert accordingly
        # For example: "1000000,00" -> "1000000.00"
        s <- paste0(s_wo_last, ".", digits_after)
      }
    }

    # In R, numerical values have 53 bits of precision (9.0e15),
    # so very large numbers that exceed R's numeric limit will be rounded
    # to the nearest representable double.
    # For example, as.numeric("9999999999999999999") returns 10000000000000002048.
    as.numeric(s)
  }

  vapply(i, parse_one, numeric(1))
}

# Sanity check:
clean_number(c(",1", "0.1", "0,75", "1%", "1000", "1000000,00", "1.000",
               "1,00,000", "1.000.000.00", "1.000.000.", "0010", "10x", "7e-1",
               "9999999999999999999", "0", "0.0", "0,0", "0.00", "0,00", "07",
               "0.7", "0,7", "00", "00,00", "00.00"))
                    ,1                    0.1                   0,75                     1%                   1000             1000000,00 
                    NA                     NA                     NA                     NA                 1000.0              1000000.0 
                 1.000               1,00,000           1.000.000.00             1.000.000.                   0010                    10x 
                1000.0               100000.0              1000000.0              1000000.0                     NA                     NA 
                  7e-1    9999999999999999999                      0                    0.0                    0,0                   0.00 
                   0.7 10000000000000002048.0                    0.0                    0.0                    0.0                    0.0 
                  0,00                     07                    0.7                    0,7                     00                  00,00 
                   0.0                     NA                     NA                     NA                     NA                     NA 
                 00.00 
                    NA 
# Apply function to the values in open text fields
df_merged <- df_merged |>
  dplyr::mutate(
    income_text_clean = clean_number(income_text_orig),
    assets_clean = clean_number(assets_orig),
    debts_clean = clean_number(debts_orig)) |>
  dplyr::relocate(income_text_clean, .after = income_text_orig) |>
  dplyr::relocate(assets_clean, .after = assets_orig) |>
  dplyr::relocate(debts_clean, .after = debts_orig)

# Sanity check: View changes between original and cleaned income text
df_merged |>
  dplyr::mutate(
    income_text_clean = as.character(income_text_clean),
    n_digits_orig = stringr::str_count(income_text_orig, "[0-9]"),
    n_digits_clean = stringr::str_count(income_text_clean, "[0-9]")
  ) |>
  dplyr::filter(income_text_clean != income_text_orig) |>
  dplyr::select(
    ResponseId,
    income_text_orig,
    income_text_clean,
    n_digits_orig,
    n_digits_clean
  ) |>
  print_reactable(sorted_col = "income_text_orig", width = 800)
# Sanity check: View changes between original and cleaned assets text
df_merged |>
  dplyr::mutate(
    assets_clean = as.character(assets_clean),
    debts_clean = as.character(debts_clean),
    n_digits_orig = stringr::str_count(assets_orig, "[0-9]"),
    n_digits_clean = stringr::str_count(assets_clean, "[0-9]")
  ) |>
  dplyr::filter(assets_clean != assets_orig) |>
  dplyr::select(
    ResponseId,
    assets_orig,
    assets_clean,
    n_digits_orig,
    n_digits_clean
  )  |>
  print_reactable(sorted_col = "assets_orig", width = 800)
# Sanity check: View the new cleaned variables
df_merged |>
  dplyr::select(
    income_orig,
    income_text_orig,
    income_text_clean,
    assets_orig,
    assets_clean,
    debts_orig,
    debts_clean
  ) |>
  dplyr::glimpse(width = 150)
Rows: 69,408
Columns: 7
$ income_orig       <dbl> 7, 9, 6, 9, 7, 8, 5, 4, 5, 7, 1, 5, 4, 6, 7, 6, 10, 10, 6, 3, 9, 7, 2, 3, 3, 1, 9, 10, 4, 4, 4, 8, 8, 9, 9, 10, 2, 3, 10, …
$ income_text_orig  <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "8000", "15000", NA, NA, NA, NA, NA, NA, NA, NA, NA, "7000…
$ income_text_clean <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 8000, 15000, NA, NA, NA, NA, NA, NA, NA, NA, NA, 7000, NA,…
$ assets_orig       <chr> "5", "2", "20.000", "1000000", "5000", "0,00", "250000", "1,000.00", "50000", "00", "0", "1000000", "0", "700000", "200000…
$ assets_clean      <dbl> 5, 2, 20000, 1000000, 5000, 0, 250000, 1000, 50000, NA, 0, 1000000, 0, 700000, 20000000, 0, 0, 600000, 70000000, 1, 0, 0, …
$ debts_orig        <chr> "10000000", "2", "18000", "0", "125000", "0,00", "0", "1,000.00", "0", "20.000,00", "1500", "0", "0", "0", "200000000", "0…
$ debts_clean       <dbl> 10000000, 2, 18000, 0, 125000, 0, 0, 1000, 0, 20000, 1500, 0, 0, 0, 200000000, 0, 90000, 0, 0, 0, 150000, 0, 200000, 0, 0,…
# Sanity check: Count missing values in cleaned variables
dplyr::summarise(df_merged,
  n_income_orig_text = sum(!is.na(income_text_orig)),
  n_income_text_clean_na = sum(is.na(income_text_clean) & !is.na(income_text_orig)),
  n_assets_orig = sum(!is.na(assets_orig)),
  n_assets_clean_na = sum(is.na(assets_clean) & !is.na(assets_orig)),
  n_debts_orig = sum(!is.na(debts_orig)),
  n_debts_clean_na = sum(is.na(debts_clean) & !is.na(debts_orig))
)
# A tibble: 1 × 6
  n_income_orig_text n_income_text_clean_na n_assets_orig n_assets_clean_na n_debts_orig n_debts_clean_na
               <int>                  <int>         <int>             <int>        <int>            <int>
1               6973                      0         56550               861        56027              909
# Sanity check: View the new cleaned variables
# View values that contain non-numeric characters besides "." and ","
df_merged |>
  dplyr::filter(!stringr::str_detect(income_text_orig, "^[0-9,\\.]+$") &
      !is.na(income_orig)) |> select(income_text_orig, income_text_clean) |>
  base::nrow();
[1] 0
df_merged |>
  dplyr::filter(!stringr::str_detect(assets_orig, "^[0-9,\\.]+$") &
      !is.na(assets_orig)) |> select(assets_orig, assets_clean);
# A tibble: 17 × 2
   assets_orig assets_clean
   <chr>              <dbl>
 1 -0                    NA
 2 +10000                NA
 3 40%                   NA
 4 4.5e7           45000000
 5 1.78e10      17800000000
 6 -0                    NA
 7 +80.000               NA
 8 6.265e9       6265000000
 9 -0                    NA
10 0x0                   NA
11 1.3425e10    13425000000
12 10 %                  NA
13 10%                   NA
14 2%                    NA
15 30%                   NA
16 0%                    NA
17 +1000000              NA
df_merged |>
  dplyr::filter(!stringr::str_detect(debts_orig, "^[0-9,\\.]+$") &
      !is.na(debts_orig)) |> select(debts_orig, debts_clean)
# A tibble: 10 × 2
   debts_orig debts_clean
   <chr>            <dbl>
 1 7%                  NA
 2 50%                 NA
 3 8.95e8       895000000
 4 5%                  NA
 5 , 60000             NA
 6 10%                 NA
 7 10%                 NA
 8 20%                 NA
 9 ,0%                 NA
10 +4000               NA
Add financial country-level values
fin_values <-
  readr::read_csv("111_country_variables.csv", show_col_types = FALSE) |>
  dplyr::glimpse(width = 100)
Rows: 126
Columns: 23
$ country                          <chr> "Albania", "Algeria", "Angola", "Argentina", "Armenia", "…
$ language                         <chr> "Albanian", "Arabic", "Portuguese", "Spanish", "Armenian"…
$ UserLanguage                     <chr> "SQI-ALB", "AR-DZA", "PT-AGO", "ES-ARG", "AM-ARM", "EN-AU…
$ income_period                    <chr> "monthly", "monthly", "monthly", "monthly", "monthly", "a…
$ income_type                      <chr> "gross", "gross", "gross", "gross", "gross", "gross", "ne…
$ income_year                      <dbl> 2024, 2025, 2024, 2024, 2024, 2025, 2024, 2024, 2024, 202…
$ income_currency                  <chr> "lek", "د.ج", "Kz", "$ (peso)", "ՀՀ դրամ", "AU$", "€", "د…
$ income_currency_position         <chr> "left", "right", "left", "right", "left", "right", "left"…
$ income_cutoff_min                <dbl> 12000, 10000, 50000, 250000, 15000, 40000, 14508, 200, 20…
$ assets_cutoff_min                <dbl> 1000, 10000, 0, 100, 10000, 500, 0, 1000, 1000, 100, 100,…
$ debts_cutoff_min                 <dbl> 1000, 1000, 10000, 100, 10000, 0, 0, 100, 100, 10, 100, 1…
$ assets_upper_limit               <dbl> 40000001, 100000000, NA, 350000000, 50000000, 30000000, 2…
$ debts_upper_limit                <dbl> 50000001, 300000000, NA, 350000000, 50000000, 3000000, 10…
$ wages_per_year                   <dbl> 12, 12, 13, 13, 12, NA, NA, NA, 12, NA, 13, 13, 13, 12, N…
$ inflation2024_factor             <dbl> NA, 1.0010, NA, NA, NA, 1.0182, NA, NA, NA, NA, NA, NA, N…
$ one_local_unit_to_USD_conversion <dbl> 0.010738447, 0.007728573, 0.001149628, 0.001093261, 0.002…
$ one_USD_to_local_unit_conversion <dbl> 93.123, 129.390, 869.846, 914.695, 392.730, 1.531, 0.924,…
$ country_region                   <chr> "Europe & Central Asia", "Middle East, North Africa, Afgh…
$ continent                        <chr> "Europe", "MENA", "Africa", "South America", "Europe", "O…
$ country_incomegroup              <chr> "Upper middle income", "Upper middle income", "Lower midd…
$ soft_launch                      <chr> "June 2", "June 7", "June 2", "June 2", "June 2", "June 5…
$ target_size                      <dbl> 300, 600, 600, 600, 300, 600, 300, 300, 300, 1200, 600, 6…
$ comment_country                  <chr> NA, "Collaborator said that the household income values a…
# Join financial country-level values to the main dataset
nrow(df_merged)
[1] 69408
df_merged <- df_merged |>
  dplyr::left_join(dplyr::select(fin_values, -country), by = "UserLanguage")

# Sanity check: Number of rows should remain the same
nrow(df_merged)
[1] 69408
Create categorical variables
# Add categorical variable
df_merged <- df_merged |>
  dplyr::mutate(

    # Considers all options
    income_orig_cat_11 =
      dplyr::case_when(
        income_orig == 0 ~ "No income",
        income_orig == 1 ~ "Second decile",
        income_orig == 2 ~ "Third decile",
        income_orig == 3 ~ "Fourth decile",
        income_orig == 4 ~ "Fifth decile",
        income_orig == 5 ~ "Sixth decile",
        income_orig == 6 ~ "Seventh decile",
        income_orig == 7 ~  "Eighth decile",
        income_orig == 8 ~ "Ninth decile",
        income_orig == 9 ~ "Tenth decile",
        income_orig == 10 ~ "Specify",
        TRUE ~ NA_character_
     ),

    # Only considers the first 10 options and gives NA to "Specify"
    income_orig_cat_10 =
      dplyr::case_when(
        income_orig == 0 ~ "No income",
        income_orig == 1 ~ "Second decile",
        income_orig == 2 ~ "Third decile",
        income_orig == 3 ~ "Fourth decile",
        income_orig == 4 ~ "Fifth decile",
        income_orig == 5 ~ "Sixth decile",
        income_orig == 6 ~ "Seventh decile",
        income_orig == 7 ~ "Eighth decile",
        income_orig == 8 ~ "Ninth decile",
        income_orig == 9 ~ "Tenth decile",
        TRUE ~ NA_character_
     )
    ) |>
  dplyr::relocate(income_orig_cat_11, income_orig_cat_10, .after = income_orig)

# Sanity check: View the mapping distribution of the new income variables
df_merged |> dplyr::count(income_orig, income_orig_cat_11)
# A tibble: 12 × 3
   income_orig income_orig_cat_11     n
         <dbl> <chr>              <int>
 1           0 No income           2450
 2           1 Second decile       5532
 3           2 Third decile        6495
 4           3 Fourth decile       6707
 5           4 Fifth decile        6593
 6           5 Sixth decile        5832
 7           6 Seventh decile      5160
 8           7 Eighth decile       5042
 9           8 Ninth decile        3933
10           9 Tenth decile        5890
11          10 Specify             6974
12          NA <NA>                8800
df_merged |> dplyr::count(income_orig, income_orig_cat_10)
# A tibble: 12 × 3
   income_orig income_orig_cat_10     n
         <dbl> <chr>              <int>
 1           0 No income           2450
 2           1 Second decile       5532
 3           2 Third decile        6495
 4           3 Fourth decile       6707
 5           4 Fifth decile        6593
 6           5 Sixth decile        5832
 7           6 Seventh decile      5160
 8           7 Eighth decile       5042
 9           8 Ninth decile        3933
10           9 Tenth decile        5890
11          10 <NA>                6974
12          NA <NA>                8800
Add income bracket information

Country-specific adjustments were applied for an efficient mapping. For example, due to the phrasing, some countries had overlapping values in the brackets: if the last bracket was “more than 4500” and 4500 was the same as the low point of the previous bracket.

# Load the income bracket information and apply country-specific adjustments.
income_recoded <- base::readRDS("111_income_recoded.rds") |>
  dplyr::mutate(
    income_lowpoint =
      dplyr::case_when(
        # Correct Mongolia's income bracket error. Where it reads
        # "₮1,700,001 – ₮2,000,00" should be "₮1,700,001 – ₮2,000,000".
        # Any reasonable person would be able to spot that,
        # if they even noticed it.
        UserLanguage %in% c("MN-MNG", "EN-MNG") & income_orig == 7 ~ 1700001,
        
        # Qatar 5th bracket: AR-QAT: [150000-250000]; EN-QAT: [150001-250000]
        UserLanguage == "AR-QAT" & income_orig == 5 ~ 150001,
        
        # Correct the third bracket in Morocco because it reads "Around 2,500 dirhams
        # per month" in the middle of the deciles.
        UserLanguage == "AR-MAR" & income_orig == 2 ~ 1500,

        TRUE ~ income_lowpoint
    ),

    income_highpoint =
      dplyr::case_when(
        UserLanguage %in% c("MN-MNG", "EN-MNG") & income_orig == 7 ~ 2000000,

        # Correct the third bracket in Morocco because it reads "Around 2,500 dirhams
        # per month" in the middle of the deciles.
        UserLanguage == "AR-MAR" & income_orig == 2 ~ 2500,

        # Correct Uzbekistan's income brackets so the highpoint of each decile
        # matches the lowpoint of the next decile (e.g., coding 14.9 mln as 14999999
        UserLanguage %in% c("UZ-UZB", "RU-UZB") & income_orig == 3 ~ 4999999,
        UserLanguage %in% c("UZ-UZB", "RU-UZB") & income_orig == 4 ~ 9999999,
        UserLanguage %in% c("UZ-UZB", "RU-UZB") & income_orig == 5 ~ 14999999,
        UserLanguage %in% c("UZ-UZB", "RU-UZB") & income_orig == 6 ~ 19999999,
        UserLanguage %in% c("UZ-UZB", "RU-UZB") & income_orig == 7 ~ 24999999,
        UserLanguage %in% c("UZ-UZB", "RU-UZB") & income_orig == 8 ~ 29999999,

        # Georgia 2nd bracket: KA-GEO 0-500; EN-GEO 0-550.
        UserLanguage == "KA-GEO" & income_orig == 1 ~ 550,
        
        # Kyrgyzstan 8th bracket: KY-KGZ [100000-119000]; RU-KGZ [100000-119999]. 
        UserLanguage == "KY-KGZ" & income_orig == 8 ~ 119999,
        
        # Ar-TCD 3rd bracket overlaps with the 2nd bracket and do not match with
        # FR-TCD's 3rd bracket.
        UserLanguage == "AR-TCD" & income_orig == 3 ~ 3000000,

        TRUE ~ income_highpoint
    ),
  )

# Sanity check: Any country have the different brackets across languages?
# We expect to only have differences between Ireland's sponsored and main versions.
income_recoded |>
  dplyr::group_by(country, income_orig) |>
  dplyr::summarise(
    n_lang = dplyr::n_distinct(UserLanguage),
    n_brackets = dplyr::n_distinct(
      paste(income_lowpoint, income_highpoint)
    ),
    bracket_defs = paste0(
      UserLanguage, ": [", income_lowpoint, "-", income_highpoint, "]",
      collapse = "; "
    ),
    .groups = "drop"
  ) |> dplyr::filter(n_lang > 1, n_brackets > 1)
# A tibble: 9 × 5
  country income_orig n_lang n_brackets bracket_defs                                              
  <chr>         <int>  <int>      <int> <chr>                                                     
1 Ireland           1      2          2 EN-IRL: [0-17500]; EN-IRL-sponsored: [0-22000]            
2 Ireland           2      2          2 EN-IRL: [17500-24999]; EN-IRL-sponsored: [22001-32000]    
3 Ireland           3      2          2 EN-IRL: [25000-34999]; EN-IRL-sponsored: [32001-42000]    
4 Ireland           4      2          2 EN-IRL: [35000-49999]; EN-IRL-sponsored: [42001-55000]    
5 Ireland           5      2          2 EN-IRL: [50000-74999]; EN-IRL-sponsored: [55001-67000]    
6 Ireland           6      2          2 EN-IRL: [75000-99999]; EN-IRL-sponsored: [67001-85000]    
7 Ireland           7      2          2 EN-IRL: [100000-149999]; EN-IRL-sponsored: [85001-105000] 
8 Ireland           8      2          2 EN-IRL: [150000-200000]; EN-IRL-sponsored: [105001-137000]
9 Ireland           9      2          2 EN-IRL: [200000-NA]; EN-IRL-sponsored: [137000-NA]        
# Correct gaps between brackets
income_gaps <- income_recoded |>
  dplyr::group_by(UserLanguage) |>
  dplyr::arrange(income_orig, .by_group = TRUE) |>
  
  # First check lowpoints
  dplyr::mutate(
    prev_high = dplyr::lag(income_highpoint),
    expected_low = prev_high + 1,
    has_gap = income_orig >= 2 &
      income_orig <= 8 &
      !is.na(prev_high) &
      !is.na(income_lowpoint) &
      income_lowpoint != expected_low,
    income_lowpoint_adj = dplyr::if_else(
      has_gap,
      expected_low,
      income_lowpoint
    ),
      
    # Then highpoints
    next_low = dplyr::lead(income_lowpoint_adj),
    expected_high = next_low - 1L,
    high_needs_fix =
      income_orig >= 2 &
      income_orig <= 8 &
      !is.na(next_low) &
      !is.na(income_highpoint) &
      income_highpoint != expected_high,
    income_highpoint_adj = dplyr::if_else(
      high_needs_fix,
      expected_high,
      income_highpoint
    )
  ) |>
  dplyr::ungroup()

# Sanity check: View languages where there is a gap
income_gaps |>
  dplyr::filter(has_gap) |>
  dplyr::select(
    UserLanguage,
    income_orig,
    prev_high,
    income_lowpoint,
    expected_low
  )  |>
  print_reactable(sorted_col = "UserLanguage", width = 800)
# Transform income_recoded into a wider format for merging
income_info <- income_gaps |>
  dplyr::select(UserLanguage, income_orig, 
                income_lowpoint, income_lowpoint_adj, 
                income_highpoint, income_highpoint_adj) |>
  tidyr::pivot_longer(
    cols = c(income_lowpoint, income_lowpoint_adj, 
             income_highpoint, income_highpoint_adj),
    names_to = "bound",
    values_to = "value"
  ) |>
  tidyr::pivot_wider(
    names_from = c(bound, income_orig),
    values_from = value,
    names_sep = "_"
  ) |> dplyr::glimpse(width = 100)
Rows: 125
Columns: 37
$ UserLanguage           <chr> "AM-ARM", "AM-ETH", "AR-ARE", "AR-BHR", "AR-DZA", "AR-EGY", "AR-KWT…
$ income_lowpoint_1      <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ income_lowpoint_adj_1  <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ income_highpoint_1     <dbl> 24000, 600, 60000, 300, 15000, 70000, 500, 18000000, 1500, 499, 500…
$ income_highpoint_adj_1 <dbl> 24000, 600, 60000, 300, 15000, 70000, 500, 18000000, 1500, 499, 500…
$ income_lowpoint_2      <dbl> 24001, 601, 60000, 301, 15000, 70001, 500, 18000000, 1500, 500, 500…
$ income_lowpoint_adj_2  <dbl> 24001, 601, 60001, 301, 15001, 70001, 501, 18000001, 1501, 500, 500…
$ income_highpoint_2     <dbl> 48000, 1200, 119999, 600, 24999, 200000, 999, 30000000, 2500, 999, …
$ income_highpoint_adj_2 <dbl> 48000, 1200, 119999, 600, 24999, 200000, 999, 30000000, 2500, 999, …
$ income_lowpoint_3      <dbl> 48001, 1201, 120000, 601, 25000, 200001, 1000, 30000000, 2500, 1000…
$ income_lowpoint_adj_3  <dbl> 48001, 1201, 120000, 601, 25000, 200001, 1000, 30000001, 2501, 1000…
$ income_highpoint_3     <dbl> 120000, 1800, 179999, 900, 34999, 400000, 1499, 60000000, 4000, 149…
$ income_highpoint_adj_3 <dbl> 120000, 1800, 179999, 900, 34999, 400000, 1499, 60000000, 4000, 149…
$ income_lowpoint_4      <dbl> 120001, 1801, 180000, 901, 35000, 400001, 1500, 60000000, 4000, 150…
$ income_lowpoint_adj_4  <dbl> 120001, 1801, 180000, 901, 35000, 400001, 1500, 60000001, 4001, 150…
$ income_highpoint_4     <dbl> 192000, 2400, 239999, 1200, 49999, 600000, 1999, 90000000, 6000, 19…
$ income_highpoint_adj_4 <dbl> 192000, 2400, 239999, 1200, 49999, 600000, 1999, 90000000, 6000, 19…
$ income_lowpoint_5      <dbl> 192000, 2401, 240000, 1201, 50000, 600001, 2000, 90000000, 6000, 20…
$ income_lowpoint_adj_5  <dbl> 192001, 2401, 240000, 1201, 50000, 600001, 2000, 90000001, 6001, 20…
$ income_highpoint_5     <dbl> 383000, 3000, 319999, 1500, 74999, 800000, 2999, 120000000, 8000, 2…
$ income_highpoint_adj_5 <dbl> 383000, 3000, 319999, 1500, 74999, 800000, 2999, 120000000, 8000, 2…
$ income_lowpoint_6      <dbl> 383001, 3001, 320000, 1501, 75000, 800001, 3000, 120000000, 8000, 2…
$ income_lowpoint_adj_6  <dbl> 383001, 3001, 320000, 1501, 75000, 800001, 3000, 120000001, 8001, 2…
$ income_highpoint_6     <dbl> 575000, 5000, 399999, 1800, 99999, 1200000, 3999, 150000000, 10000,…
$ income_highpoint_adj_6 <dbl> 575000, 5000, 399999, 1800, 99999, 1200000, 3999, 150000000, 10000,…
$ income_lowpoint_7      <dbl> 575001, 5001, 400000, 1801, 100000, 1200001, 4000, 150000000, 10000…
$ income_lowpoint_adj_7  <dbl> 575001, 5001, 400000, 1801, 100000, 1200001, 4000, 150000001, 10001…
$ income_highpoint_7     <dbl> 960000, 10000, 499999, 2000, 149999, 2400000, 4999, 200000000, 1250…
$ income_highpoint_adj_7 <dbl> 960000, 10000, 499999, 2000, 149999, 2400000, 4999, 200000000, 1250…
$ income_lowpoint_8      <dbl> 960000, 10001, 500000, 2001, 150000, 2400001, 5000, 200000000, 1250…
$ income_lowpoint_adj_8  <dbl> 960001, 10001, 500000, 2001, 150000, 2400001, 5000, 200000001, 1250…
$ income_highpoint_8     <dbl> 1200000, 20000, 699999, 2300, 200000, 4800000, 6000, 300000000, 150…
$ income_highpoint_adj_8 <dbl> 1200000, 19999, 699999, 2299, 199999, 4799999, 5999, 299999999, 149…
$ income_lowpoint_9      <dbl> 1200001, 20000, 700000, 2300, 200000, 4800000, 6000, 300000000, 150…
$ income_lowpoint_adj_9  <dbl> 1200001, 20000, 700000, 2300, 200000, 4800000, 6000, 300000000, 150…
$ income_highpoint_9     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ income_highpoint_adj_9 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
nrow(df_merged)
[1] 69408
df_merged <- df_merged |>
  dplyr::left_join(income_info, by = "UserLanguage") |>
  dplyr::relocate(income_lowpoint_1:income_highpoint_adj_9,
    .after = income_orig_cat_10
  )

# Sanity check
nrow(df_merged)
[1] 69408
Identify strange numbers in income, assets, and debts
# Create function to identify strange numbers.
weird_nr <- function(i) {

  # Temporary transform into a character vector so we can use stringr functions
  s <- as.character(i)

  # Flag numbers with the same non-zero digit repeated >=4 (e.g., 1111, 9999)
  # except for zeros.
  rep4 <-
    stringr::str_detect(s, "(?:1111|2222|3333|4444|5555|6666|7777|8888|9999)")

  # Flag sequential numbers of length >= 3 ascending or descending
  # (e.g., 123, 1234, 4321)
  asc3  <- stringr::str_detect(s, "(?:123|234|345|456|567|678|789)")
  desc3 <- stringr::str_detect(s, "(?:321|432|543|654|765|876|987)")

  # Flag repeated 2-digit blocks (e.g., 3939, 1212, 4545)
  repeat2 <- stringr::str_detect(s, "(?!0{2})(\\d{2})\\1+")

  # Combine all flags and check if any is TRUE
  outcome <- (rep4 | asc3 | desc3 | repeat2)

  # Make NAs as not weird
  outcome[is.na(outcome)] <- FALSE

  outcome
}

# Sanity check:
weird_nr(c(999999, 12340, 43210, 3939, 540000, 75000, NA))
[1]  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE
# Apply function to financial variables
df_merged <- df_merged |>
  dplyr::mutate(
    income_wrd = weird_nr(income_text_clean) |
      # Also detect rows where original text exists but cleaning is NA
      (!is.na(income_text_orig) & is.na(income_text_clean)),

    assets_wrd = weird_nr(assets_clean) |
      (!is.na(assets_orig) & is.na(assets_clean)),

    debts_wrd  = weird_nr(debts_clean) |
      (!is.na(debts_orig) & is.na(debts_clean))
  ) |>
  relocate(income_wrd, .after = income_text_clean) |>
  relocate(assets_wrd, .after = assets_clean) |>
  relocate(debts_wrd, .after = debts_clean)

# Sanity check: View the counts of weird numbers per variable
base::table(df_merged$income_wrd, useNA = "always")

FALSE  TRUE  <NA> 
69353    55     0 
base::table(df_merged$assets_wrd, useNA = "always")

FALSE  TRUE  <NA> 
68386  1022     0 
base::table(df_merged$debts_wrd, useNA = "always")

FALSE  TRUE  <NA> 
68411   997     0 
# Sanity check: View changes between original and cleaned income text
df_merged |>
  dplyr::mutate(
    income_text_clean = as.character(income_text_clean),
    n_digits_orig = stringr::str_count(income_text_orig, "[0-9]"),
    n_digits_clean = stringr::str_count(income_text_clean, "[0-9]")
  ) |>
  dplyr::filter(
    income_text_clean != income_text_orig | 
      (!is.na(income_text_orig) & is.na(income_text_clean))) |>
  dplyr::group_by(
    income_text_orig,
    income_text_clean,
    income_wrd,
    n_digits_orig,
    n_digits_clean
  ) |> 
  dplyr::summarise(n = dplyr::n())  |>
  print_reactable(sorted_col = "income_text_orig", width = 800)
# Sanity check: View changes between original and cleaned assets text
df_merged |>
  dplyr::mutate(
    assets_clean = as.character(assets_clean),
    debts_clean = as.character(debts_clean),
    n_digits_orig = stringr::str_count(assets_orig, "[0-9]"),
    n_digits_clean = stringr::str_count(assets_clean, "[0-9]")
  ) |>
  dplyr::filter(
    assets_clean != assets_orig | (!is.na(assets_orig) & is.na(assets_clean))) |>
  dplyr::group_by(
    assets_orig,
    assets_clean,
    assets_wrd,
    n_digits_orig,
    n_digits_clean
  ) |>
  dplyr::summarise(n = dplyr::n()) |>
  print_reactable(sorted_col = "assets_orig", width = 800)
# Sanity check: View changes between original and cleaned debts text
df_merged |>
  dplyr::mutate(
    debts_clean = as.character(debts_clean),
    n_digits_orig = stringr::str_count(debts_orig, "[0-9]"),
    n_digits_clean = stringr::str_count(debts_clean, "[0-9]")
  ) |>
  dplyr::filter(
    debts_clean != debts_orig | (!is.na(debts_orig) & is.na(debts_clean)
    )) |>
  dplyr::group_by(
    debts_orig,
    debts_clean,
    debts_wrd,
    n_digits_orig,
    n_digits_clean
  ) |>
  dplyr::summarise(n = dplyr::n()) |>
  print_reactable(sorted_col = "debts_orig", width = 500)
# Sanity check: Count missing values in cleaned variables
dplyr::summarise(df_merged,
  n_income_orig_text = sum(!is.na(income_text_orig)),
  n_income_text_clean_na = sum(is.na(income_text_clean) & !is.na(income_text_orig)),
  n_assets_orig = sum(!is.na(assets_orig)),
  n_assets_clean_na = sum(is.na(assets_clean) & !is.na(assets_orig)),
  n_debts_orig = sum(!is.na(debts_orig)),
  n_debts_clean_na = sum(is.na(debts_clean) & !is.na(debts_orig))
)
# A tibble: 1 × 6
  n_income_orig_text n_income_text_clean_na n_assets_orig n_assets_clean_na n_debts_orig n_debts_clean_na
               <int>                  <int>         <int>             <int>        <int>            <int>
1               6973                      0         56550               861        56027              909
# Sanity check: View the rows with NA in cleaned values
# but original text exists
df_merged |> dplyr::group_by(income_text_orig, income_text_clean, income_wrd) |>
  dplyr::filter(!is.na(income_text_orig) & is.na(income_text_clean)) |>
  dplyr::summarise(n = dplyr::n()) |> base::nrow()
[1] 0
df_merged |> dplyr::group_by(assets_orig, assets_clean, assets_wrd) |>
  dplyr::filter(!is.na(assets_orig) & is.na(assets_clean)) |>
  dplyr::summarise(n = dplyr::n()) |>
  print_reactable(sorted_col = "assets_orig", width = 500)
df_merged |> dplyr::group_by(debts_orig, debts_clean, debts_wrd) |>
  dplyr::filter(!is.na(debts_orig) & is.na(debts_clean)) |>
  dplyr::summarise(n = dplyr::n())  |>
  print_reactable(sorted_col = "debts_orig", width = 500)
# Cleanup
# We will need income_recoded, income_gaps, and income_info later
rm(fin_values, clean_number, weird_nr)

Household Size

# Sanity check: View the options of this item
table_label(df_merged$household_size)
$household_size
How many people in your household are covered by these finances? Put 1 if you live alone, or if you live with others (e.g., roommates) but are financially independent from them and vice-versa, put 1. Otherwise, list the total number of people living with you that are part of household finances (both incomes and expenses).
    1     2     3     4     5     6     7     8     9    10    11    12    13    14    15    16    17    18    19    20  <NA> 
15591 15309 10311  9886  4984  2109   948   596   243   312    71    73    31    29    48    21     8    16    17   115  8690 
Class: numeric 
df_merged <- df_merged |>
  dplyr::mutate(
    household_size_group = factor(
      dplyr::case_when(
        household_size == 1 ~ "1",
        household_size == 2 ~ "2",
        household_size == 3 ~ "3",
        household_size %in% c(4, 5) ~ "4-5",
        household_size >= 6 ~ "6-20",
        TRUE ~ NA_character_
      ),
      levels = c("1", "2", "3", "4-5", "6-20"),
      ordered = TRUE
    )
  ) |>
  dplyr::relocate(household_size_group, .after = household_size)

# Sanity check: View the mapping distribution of the new household size variable
base::table(df_merged$household_size, df_merged$household_size_group, useNA = "always")
      
           1     2     3   4-5  6-20  <NA>
  1    15591     0     0     0     0     0
  2        0 15309     0     0     0     0
  3        0     0 10311     0     0     0
  4        0     0     0  9886     0     0
  5        0     0     0  4984     0     0
  6        0     0     0     0  2109     0
  7        0     0     0     0   948     0
  8        0     0     0     0   596     0
  9        0     0     0     0   243     0
  10       0     0     0     0   312     0
  11       0     0     0     0    71     0
  12       0     0     0     0    73     0
  13       0     0     0     0    31     0
  14       0     0     0     0    29     0
  15       0     0     0     0    48     0
  16       0     0     0     0    21     0
  17       0     0     0     0     8     0
  18       0     0     0     0    16     0
  19       0     0     0     0    17     0
  20       0     0     0     0   115     0
  <NA>     0     0     0     0     0  8690

Birth Year and Age

Participants were able to write in an open text field their birth year, and the validation required values between 1925 and 2007, except for Iran (FA-IRN), where the validation ranged from 1304 to 1386.

# Sanity check: Class of the variable
class(df_merged$birth_year_orig)
[1] "character"
class(df_merged$age)
[1] "numeric"
# Sanity check: View values with non-numeric characters
df_merged |>
  dplyr::filter(!is.na(birth_year_orig) & grepl("\\D", birth_year_orig)) |>
  dplyr::select(birth_year_orig) |> 
  dplyr::distinct() |> 
  base::print(n = Inf)
# A tibble: 41 × 1
   birth_year_orig
   <chr>          
 1 2001.          
 2 1993.          
 3 2000.          
 4 2003.          
 5 2002.          
 6 1995.          
 7 2005.          
 8 1997.          
 9 1969.          
10 1977.          
11 2005,          
12 1985.          
13 1982.          
14 1983.          
15 2004.          
16 1996.          
17 1980.          
18 1994.          
19 2007.          
20 1992.          
21 2006.          
22 1975.          
23 1972.          
24 1978.          
25 1999.          
26 1990.          
27 1989.          
28 1945.          
29 1981.          
30 1971.          
31 1974.          
32 1955.          
33 1949.          
34 1959.          
35 1988.          
36 ,1979          
37 1982.0424      
38 ,1953          
39 1973.01        
40 1963.          
41 1951.          
# Create cleaned column and keep original.
# Calculate age.
df_merged <- df_merged |>
  dplyr::mutate(
    # extract first 4-digit sequence and transform to numerical
    birth_year_clean = 
      as.numeric(stringr::str_extract(birth_year_orig, "\\d{4}")),

    age = dplyr::case_when(
      
      # Keep the values of the participants from the Irish sponsored dataset
      !is.na(age) ~ age,
      
      # If rows in birth year contains NA, then keep NA
      is.na(birth_year_clean) & is.na(age) ~ NA_real_,
      
      # If Q_Language is "FA-IRN",
      # then use the Solar Hijri calendar (1404)
      UserLanguage == "FA-IRN" & !is.na(birth_year_clean) ~ 1404 - birth_year_clean,
      
      # Otherwise, use the Gregorian calendar (2025)
      !is.na(birth_year_clean) ~ 2025 - birth_year_clean,
      
      TRUE ~ NA_real_
    ),
    
    # Create age groups
    age_group = base::factor(dplyr::case_when(
      age >= 18 & age <= 25 ~ "18-25",
      age >= 26 & age <= 44 ~ "26-44",
      age >= 45 & age <= 64 ~ "45-64",
      age >= 65 & age <= 74 ~ "65-74",
      age >= 75 ~ "75+",
      TRUE ~ NA_character_
    ),
    levels = c(
      "18-25",
      "26-44",
      "45-64",
      "65-74",
      "75+"
    ))
  ) |>
  dplyr::relocate(birth_year_clean:age_group, .after = birth_year_orig)


# Sanity check: View the summary of the cleaned birth year
cat(
  "Min: ",
  min(df_merged$birth_year_clean, na.rm = TRUE),
  "\nMax: ",
  max(df_merged$birth_year_clean, na.rm = TRUE),
  "\nNA count: ",
  sum(is.na(df_merged$birth_year_clean)),
  "\nClass: ",
  class(df_merged$birth_year_clean)
)
Min:  1328 
Max:  2007 
NA count:  10380 
Class:  numeric
# Sanity check: Are there rows where raw birth year exists but cleaning failed?
df_merged |> 
  dplyr::filter(!is.na(birth_year_orig) & is.na(birth_year_clean)) |>
  base::nrow()
[1] 0
# Sanity check: View the summary of the age variable
cat(
  "Min: ",
  min(df_merged$age, na.rm = TRUE),
  "\nMax: ",
  max(df_merged$age, na.rm = TRUE),
  "\nNA count: ",
  sum(is.na(df_merged$age)),
  "\nClass: ",
  class(df_merged$age)
)
Min:  18 
Max:  100 
NA count:  9180 
Class:  numeric
# Sanity check: View the mapping distribution of the new age group variable
base::table(df_merged$age_group, useNA = "ifany")

18-25 26-44 45-64 65-74   75+  <NA> 
13674 30944 13098  1988   524  9180 
# Sanity check: Are there rows where raw value exists but age group is missing?
df_merged |> 
  dplyr::filter(!is.na(birth_year_orig) & is.na(age_group)) |>
  base::nrow()
[1] 0
# Sanity check: View the new birth year and age variables
dplyr::glimpse(df_merged |>
                 dplyr::select(birth_year_orig,
                               birth_year_clean,
                               age,
                               age_group),
               width = 100)
Rows: 69,408
Columns: 4
$ birth_year_orig  <chr> "1989", "1984", "1971", "1986", "1993", "2005", "1986", "1975", "1995", "…
$ birth_year_clean <dbl> 1989, 1984, 1971, 1986, 1993, 2005, 1986, 1975, 1995, 1963, 1993, 1981, 2…
$ age              <dbl> 36, 41, 54, 39, 32, 20, 39, 50, 30, 62, 32, 44, 24, 35, 31, 24, 31, 39, 5…
$ age_group        <fct> 26-44, 26-44, 45-64, 26-44, 26-44, 18-25, 26-44, 45-64, 26-44, 45-64, 26-…
# Sanity check: View counts of the sponsored Irish dataset
df_merged |>
  dplyr::filter(irl==1) |>
  dplyr::group_by(UserLanguage, birth_year_orig, birth_year_clean, age, age_group) |>
  dplyr::summarise(n = dplyr::n())
# A tibble: 67 × 6
# Groups:   UserLanguage, birth_year_orig, birth_year_clean, age [67]
   UserLanguage     birth_year_orig birth_year_clean   age age_group     n
   <chr>            <chr>                      <dbl> <dbl> <fct>     <int>
 1 EN-IRL-sponsored <NA>                          NA    18 18-25         7
 2 EN-IRL-sponsored <NA>                          NA    19 18-25         4
 3 EN-IRL-sponsored <NA>                          NA    20 18-25        12
 4 EN-IRL-sponsored <NA>                          NA    21 18-25         9
 5 EN-IRL-sponsored <NA>                          NA    22 18-25         5
 6 EN-IRL-sponsored <NA>                          NA    23 18-25         9
 7 EN-IRL-sponsored <NA>                          NA    24 18-25        10
 8 EN-IRL-sponsored <NA>                          NA    25 18-25        17
 9 EN-IRL-sponsored <NA>                          NA    26 26-44        11
10 EN-IRL-sponsored <NA>                          NA    27 26-44        16
# ℹ 57 more rows
# Sanity check: View counts of Iran dataset
df_merged |>
  dplyr::filter(UserLanguage == "FA-IRN") |>
  dplyr::group_by(UserLanguage, birth_year_orig, birth_year_clean, age, age_group) |>
  dplyr::summarise(n = dplyr::n()) |> 
  dplyr::arrange(-birth_year_clean)  |>
  print_reactable(sorted_col = "birth_year_clean", width = 800)
# Sanity check: View counts of main dataset
df_merged |>
  dplyr::filter(irl == 0 & UserLanguage != "FA-IRN") |>
  dplyr::group_by(birth_year_orig, birth_year_clean, age, age_group) |>
  dplyr::summarise(n = dplyr::n()) |> 
  dplyr::arrange(-birth_year_clean) |>
  print_reactable(sorted_col = "birth_year_clean", width = 800)

Sex

Upon collaborators’ request, the option “I prefer to use: [open text field]” was hidden from the survey versions in Kuwait (AR-KWT; EN-KWT), Egypt (AR-EGY; EN-EGY), Yemen (AR-YEM; EN-YEM), in Algeria (AR-DZA), in Saudi Arabia (AR-SAU), Chad (AR-TCD; FR-TCD), and Bahrain (AR-BHR; EN-BHR).

# Sanity check: View the counts of each option
table_label(df_merged$sex_orig)
$sex_orig
Which best describes you? - Selected Choice
    1     2     3  <NA> 
23444 36194   549  9221 
Class: numeric 
# Load recoded values regarding sex because
# some participants wrote "Female" or "Male" in the open text field
sex_recoded <- 
  readr::read_csv("111_sex_open_answers_recoded.csv", show_col_types = FALSE) |>
  dplyr::glimpse(width = 100)
Rows: 544
Columns: 2
$ ResponseId       <chr> "R_42tedcZhWdJn9Sk", "R_9hnp095IY8LIkSX", "R_2gvEljyDLXs1Yjm", "R_516n1yU…
$ sex_text_recoded <chr> "Cannot determine", "Cannot determine", "Cannot determine", "Cannot deter…
base::table(sex_recoded$sex_text_recoded, useNA = "always")

Cannot determine           Female             Male       Non-binary            Other             <NA> 
             137               12               17              344               34                0 
df_merged <- df_merged |>
  dplyr::left_join(sex_recoded, by = "ResponseId") |>

  # create a reviewed numeric coding (1 = Male, 2 = Female, 3 = Other)
  dplyr::mutate(
    sex_reviewed = dplyr::case_when(
      sex_text_recoded == "Female" ~ 2,
      sex_text_recoded == "Male" ~ 1,
      sex_text_recoded %in% c("Other", "Non-binary") ~ 3,
      sex_text_recoded == "Cannot determine" ~ NA_real_,
      TRUE ~ sex_orig
    ),

    # categorical factor with explicit levels
    sex_reviewed_cat = factor(
      dplyr::case_when(
        sex_reviewed == 1 ~ "Male",
        sex_reviewed == 2 ~ "Female",
        sex_reviewed == 3 ~ "Other",
        TRUE ~ NA_character_
      ),
      levels = c("Male", "Female", "Other")
    ),

    # binary numeric: 1 = Male, 0 = Female, NA otherwise
    sex_binary = dplyr::case_when(
      sex_reviewed == 1 ~ 1,
      sex_reviewed == 2 ~ 0,
      TRUE ~ NA_real_
      ),

    # binary factor
    sex_binary_cat = factor(
      dplyr::case_when(
        sex_binary == 1 ~ "Male",
        sex_binary == 0 ~ "Female",
        TRUE ~ NA_character_
      ),
      levels = c("Male", "Female")
    )
  ) |>

  dplyr::relocate(sex_text_recoded:sex_binary_cat, .after = sex_orig)

# Sanity check: Cross-tabs to inspect recoded text vs numeric reviewed code
df_merged |>
  dplyr::group_by(sex_reviewed, sex_reviewed_cat, sex_binary, sex_binary_cat) |>
  dplyr::summarise(n = dplyr::n(), .groups = "drop")
# A tibble: 4 × 5
  sex_reviewed sex_reviewed_cat sex_binary sex_binary_cat     n
         <dbl> <fct>                 <dbl> <fct>          <int>
1            1 Male                      1 Male           23461
2            2 Female                    0 Female         36206
3            3 Other                    NA <NA>             383
4           NA <NA>                     NA <NA>            9358
# Sanity check: Cross-tabs to inspect original values vs numeric reviewed code
table(df_merged$sex_orig, df_merged$sex_reviewed, useNA = "always")
      
           1     2     3  <NA>
  1    23444     0     0     0
  2        0 36194     0     0
  3       17    12   383   137
  <NA>     0     0     0  9221
# Sanity check: View the counts of each option
df_merged |>
  dplyr::group_by(sex_reviewed, sex_reviewed_cat, sex_binary, sex_binary_cat) |>
  dplyr::summarise(n = dplyr::n())
# A tibble: 4 × 5
# Groups:   sex_reviewed, sex_reviewed_cat, sex_binary [4]
  sex_reviewed sex_reviewed_cat sex_binary sex_binary_cat     n
         <dbl> <fct>                 <dbl> <fct>          <int>
1            1 Male                      1 Male           23461
2            2 Female                    0 Female         36206
3            3 Other                    NA <NA>             383
4           NA <NA>                     NA <NA>            9358
# Cleanup
rm(sex_recoded)

Education Level

The translated education categories of each country will be mapped to a common set of categories. Some countries had a different definition of secondary education, so the mapping will consider if the level is eligible for university entrance or not. The classification of the education levels in each country was agreed upon with the collaborators. The recoded education categories are:

  • Less than secondary (not eligible for university entrance)
  • Secondary (completed the equivalent to high school, and it is eligible for university entrance)
  • Technical (not higher education)
  • University (higher education up to a bachelor’s degree)
  • Advanced (anything beyond a bachelor’s degree)

Note:

  • The team from Ethiopia (AM-ETH and EN-ETH) requested to hide option 7 from their versions of the survey.
  • The team from Peru requested to include an option for “Inclusive Education”. Since this applies across several levels, this option was recoded to NA.
# Sanity check: View the counts of each option
table_label(df_merged$education_orig)
$education_orig
Which is the highest level of education you have completed?
    1     2     3     4     5     6     7     8  <NA> 
  328  1084  8515  8206 19982 13762  4459  3773  9299 
Class: numeric 
# Load the education categories for each country
edu_cat <- 
  readr::read_csv("111_education_recoded.csv", show_col_types = FALSE) |>
  dplyr::glimpse(width = 100)
Rows: 1,024
Columns: 5
$ UserLanguage          <chr> "AM-ARM", "AM-ARM", "AM-ARM", "AM-ARM", "AM-ARM", "AM-ARM", "AM-ARM"…
$ education_orig        <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7,…
$ education_cat         <chr> "No primary education", "Primary (Elementary/Middle School)", "High …
$ education_recoded_cat <chr> "Less than secondary", "Less than secondary", "Secondary", "Technica…
$ education_recoded     <dbl> 1, 1, 2, 3, 4, 4, 5, 5, 1, 1, 2, 3, 4, 5, NA, 5, 1, 1, 2, 3, 4, 5, 5…
# Sanity check: View if there are unexpected values in education_orig
base::table(edu_cat$education_orig, useNA = "always")

   1    2    3    4    5    6    7    8 <NA> 
 128  128  128  128  128  128  128  128    0 
# Sanity check: View if the categories match the expected values
# We expect three cells with missing values regarding 
# Peru's inclusive education level, and level 7 was hidden for Ethiopia (AM-ETH
# and EN-ETH).
edu_cat |>
  dplyr::group_by(education_recoded_cat, education_recoded) |>
  dplyr::summarise(n = dplyr::n()) |> 
  dplyr::arrange(education_recoded)
# A tibble: 6 × 3
# Groups:   education_recoded_cat [6]
  education_recoded_cat education_recoded     n
  <chr>                             <dbl> <int>
1 Less than secondary                   1   275
2 Secondary                             2   138
3 Technical                             3   119
4 University                            4   129
5 Advanced                              5   360
6 <NA>                                 NA     3
edu_cat |> 
  dplyr::filter(is.na(education_recoded))
# A tibble: 3 × 5
  UserLanguage education_orig education_cat       education_recoded_cat education_recoded
  <chr>                 <dbl> <chr>               <chr>                             <dbl>
1 AM-ETH                    7 <NA>                <NA>                                 NA
2 EN-ETH                    7 <NA>                <NA>                                 NA
3 ES-PER                    5 Inclusive education <NA>                                 NA
# Add the education categories to the main data frame
df_merged <- df_merged |>
  dplyr::left_join(
    edu_cat |> dplyr::select(
      UserLanguage,
      education_orig,
      education_cat,
      education_recoded_cat,
      education_recoded
    ),
    by = c("UserLanguage", "education_orig")
  ) |>
  dplyr::mutate(
    education_recoded_cat = base::factor(
      education_recoded_cat,
      levels = c(
        "Less than secondary",
        "Secondary",
        "Technical",
        "University",
        "Advanced"
      ),
      ordered = TRUE
    )
  ) |>
  dplyr::relocate(education_cat:education_recoded, .after = education_orig)

# Sanity check: Are there education values without a corresponding
# education_recoded and education_recoded_cat?
df_merged |>
  group_by(
    UserLanguage,
    education_orig,
    education_cat,
    education_recoded_cat,
    education_recoded
  ) |>
  dplyr::summarise(n = dplyr::n()) |>
  dplyr::filter(is.na(education_recoded) & !is.na(education_orig))
# A tibble: 1 × 6
# Groups:   UserLanguage, education_orig, education_cat, education_recoded_cat [1]
  UserLanguage education_orig education_cat       education_recoded_cat education_recoded     n
  <chr>                 <dbl> <chr>               <ord>                             <dbl> <int>
1 ES-PER                    5 Inclusive education <NA>                                 NA    31
# Sanity check: Check Irish sponsored dataset
df_merged |>
  filter(irl == 1) |>
  group_by(
    education_irl,
    education_orig,
    education_cat,
    education_recoded_cat,
    education_recoded) |> dplyr::summarise(n = dplyr::n())
# A tibble: 8 × 6
# Groups:   education_irl, education_orig, education_cat, education_recoded_cat [8]
  education_irl                            education_orig education_cat                            education_recoded_cat education_recoded     n
  <chr>                                             <dbl> <chr>                                    <ord>                             <dbl> <int>
1 Degree                                                6 Degree                                   University                            4   355
2 Diploma                                               5 Diploma                                  Technical                             3   185
3 Doctorate                                             8 Doctorate                                Advanced                              5    11
4 Junior (Inter) Certificate or Equivalent              2 Junior (Inter) Certificate or Equivalent Less than secondary                   1    67
5 Leaving Certificate                                   3 Leaving Certificate                      Secondary                             2   277
6 Less than Junior (Inter) Cert                         1 Less than Junior (Inter) Cert            Less than secondary                   1    13
7 Master's                                              7 Master's                                 Advanced                              5   154
8 Technical or Vocational Certificate                   4 Technical or Vocational Certificate      Technical                             3   138
# Cleanup
rm(edu_cat)

Employment Status

Upon collaborators’ request, the option “Part-time student” was hidden from the versions KA-GEO and EN-GEO in Georgia and SR-SRB in Serbia. The option “Military service” was hidden from the version JA-JPN in Japan.

During the survey completion, participants were not allowed to select conflicting options:

  • Employed full-time and part-time simultaneously.
  • Student full-time and part-time simultaneously.
  • Employed/working full-time or part-time and not in paid employment simultaneously.
  • Military service and not in paid employment simultaneously.
  • Military service and retired simultaneously.
  • Retired and not in paid employment simultaneously.
  • Not in paid employment by choice and looking for work or unable to work due to health/personal reasons simultaneously.
  • Looking for work and unable to work due to health/personal reasons simultaneously.

Employment status was recoded using a sequential rule:

  • Military if the military service option was selected.
  • Employed/working full-time (25+ hours per week) if the full-time employment option was selected.
  • Employed/working part-time (less than 25 hours per week) if the part-time employment option was selected.
  • Not in paid employment (looking for work) if the job-seeking option was selected and no conditions above were met.
  • Student non-working (Full or part-time) if the full- or part-time student was selected and no conditions above were met.
  • Not in paid employment (by choice/health) if not working by choice or for health reasons and no conditions above were met.
  • Retired if the retired option was selected and no conditions above were met.
# Sanity check: View the counts of each option
table_label(df_merged$employment_orig)
$employment_orig
Which most accurately describes you at this moment? You may select up to two options in case you fit more than one category.
    1   1,3   1,4   1,5   1,6   1,7   1,8   1,9     2   2,3   2,4   2,5   2,6   2,7   2,8   2,9     3   3,5   3,6     4   4,5   4,6     5     6     7 
 6511   962   984    70    18   328   671   114  1312  1440   628    34    28   113   267    54 30950   217   189  4737    47   182   589  2592  1839 
    8     9  <NA> 
 3233  1728  9571 
Class: character 
# Replace numeric values with descriptive labels
employment_labels <- c(
  "1" = "Full-time student",
  "2" = "Part-time student",
  "3" = "Employed/working full-time (25+ hours per week)",
  "4" = "Employed/working part-time (less than 25 hours per week)",
  "5" = "Military service",
  "6" = "Retired",
  "7" = "Not in paid employment (by choice)",
  "8" = "Not in paid employment (looking for work)",
  "9" = "Not in paid employment (unable to work due to health/personal reasons)")

# Function to recode multiple-choice values
recode_employment <- function(i) {
  # If row is NA, return NA
  if (is.na(i)) return(NA_character_)
  # Split the string by comma and map to labels
  codes <- strsplit(i, ",")[[1]]
  # Collapse the labels into a single string
  paste(employment_labels[trimws(codes)], collapse = "; ")
}

df_merged <- df_merged |>
  dplyr::mutate(

    # Apply recoding function to create employment_cat variable
    # so instead of "2,5", we have "Part-time student; Military service"
    employment_cat =
           stringr::str_squish(sapply(employment_orig, recode_employment)),

    employment_primary = base::factor(
      dplyr::case_when(

        # Contains option 5
        stringr::str_detect(employment_orig, fixed("5"))
        ~ "Military service",

        # Contains option 3 AND do not contain option 5
        stringr::str_detect(employment_orig, fixed("3")) &
          !(stringr::str_detect(employment_orig, fixed("5")))
        ~ "Employed/working full-time (25+ hours per week)",

        # Contains option 4 AND do not contain option 5
        # (it was not possible to select options 3 and 4 simultaneously)
        stringr::str_detect(employment_orig, fixed("4"))  &
          !(stringr::str_detect(employment_orig, fixed("5")))
        ~ "Employed/working part-time (less than 25 hours per week)",

        # Contains option 8 AND do not contain option 5
        # (it was not possible to select options 8 and 5, 3 or 4 simultaneously)
        stringr::str_detect(employment_orig, fixed("8"))
        ~ "Not in paid employment (looking for work)",

        # Contains option 1 or 2 AND do not contain option 5, 3, 4, or 8
        (stringr::str_detect(employment_orig, fixed("1")) |
        stringr::str_detect(employment_orig, fixed("2"))) &
        !(stringr::str_detect(employment_orig, fixed("5"))) &
        !(stringr::str_detect(employment_orig, fixed("3"))) &
        !(stringr::str_detect(employment_orig, fixed("4"))) &
        !(stringr::str_detect(employment_orig, fixed("8")))
        ~ "Student non-working (Full or part-time)",

        # Contains option 7 or 9 AND do not contain option 1, or 2
        # (it was not possible to select options 7 or 9
        # and 8, 5, 3 or 4 simultaneously)
        (stringr::str_detect(employment_orig, fixed("7")) |
          stringr::str_detect(employment_orig, fixed("9"))) &
        !(stringr::str_detect(employment_orig, fixed("1"))) &
        !(stringr::str_detect(employment_orig, fixed("2")))
        ~ "Not in paid employment (by choice/health)",

        # Contains option 6 AND do not contain option 5, 3, 4, 8, 1, 2, 7 or 9
        # (it was not possible to select options 6 and 7, 8, 9, 5 simultaneously)
        stringr::str_detect(employment_orig, fixed("6")) &
        !(stringr::str_detect(employment_orig, fixed("3"))) &
        !(stringr::str_detect(employment_orig, fixed("4"))) &
        !(stringr::str_detect(employment_orig, fixed("1"))) &
        !(stringr::str_detect(employment_orig, fixed("2")))
        ~ "Retired",

        TRUE ~ NA_character_
      ),

      levels = c(
        "Not in paid employment (by choice/health)",
        "Not in paid employment (looking for work)",
        "Student non-working (Full or part-time)",
        "Employed/working full-time (25+ hours per week)",
        "Employed/working part-time (less than 25 hours per week)",
        "Retired",
        "Military service"
      )
    )
  ) |>
  dplyr::relocate(employment_cat:employment_primary, .after = employment_orig)

# Sanity check: How many options were selected per participant?
df_merged |>
  dplyr::mutate(number_of_options_selected =
           if_else(is.na(employment_orig),
                   NA_integer_,
                   str_count(employment_orig, ",") + 1)) |>
  count(number_of_options_selected)
# A tibble: 3 × 2
  number_of_options_selected     n
                       <dbl> <int>
1                          1 53491
2                          2  6346
3                         NA  9571
# Sanity check: View the distribution of primary employment
base::table(df_merged$employment_primary, useNA = "ifany")

               Not in paid employment (by choice/health)                Not in paid employment (looking for work) 
                                                    3567                                                     4171 
                 Student non-working (Full or part-time)          Employed/working full-time (25+ hours per week) 
                                                    8478                                                    33541 
Employed/working part-time (less than 25 hours per week)                                                  Retired 
                                                    6531                                                     2592 
                                        Military service                                                     <NA> 
                                                     957                                                     9571 
# Sanity check: Cross-tab between primary employment and original employment
print(table(df_merged$employment_primary,
            df_merged$employment_orig, useNA = "ifany"), n = Inf)
                                                          
                                                               1   1,3   1,4   1,5   1,6   1,7   1,8   1,9     2   2,3   2,4   2,5   2,6   2,7   2,8
  Not in paid employment (by choice/health)                    0     0     0     0     0     0     0     0     0     0     0     0     0     0     0
  Not in paid employment (looking for work)                    0     0     0     0     0     0   671     0     0     0     0     0     0     0   267
  Student non-working (Full or part-time)                   6511     0     0     0    18   328     0   114  1312     0     0     0    28   113     0
  Employed/working full-time (25+ hours per week)              0   962     0     0     0     0     0     0     0  1440     0     0     0     0     0
  Employed/working part-time (less than 25 hours per week)     0     0   984     0     0     0     0     0     0     0   628     0     0     0     0
  Retired                                                      0     0     0     0     0     0     0     0     0     0     0     0     0     0     0
  Military service                                             0     0     0    70     0     0     0     0     0     0     0    34     0     0     0
  <NA>                                                         0     0     0     0     0     0     0     0     0     0     0     0     0     0     0
                                                          
                                                             2,9     3   3,5   3,6     4   4,5   4,6     5     6     7     8     9  <NA>
  Not in paid employment (by choice/health)                    0     0     0     0     0     0     0     0     0  1839     0  1728     0
  Not in paid employment (looking for work)                    0     0     0     0     0     0     0     0     0     0  3233     0     0
  Student non-working (Full or part-time)                     54     0     0     0     0     0     0     0     0     0     0     0     0
  Employed/working full-time (25+ hours per week)              0 30950     0   189     0     0     0     0     0     0     0     0     0
  Employed/working part-time (less than 25 hours per week)     0     0     0     0  4737     0   182     0     0     0     0     0     0
  Retired                                                      0     0     0     0     0     0     0     0  2592     0     0     0     0
  Military service                                             0     0   217     0     0    47     0   589     0     0     0     0     0
  <NA>                                                         0     0     0     0     0     0     0     0     0     0     0     0  9571
# Sanity check: View the counts of each option
df_merged |>
  dplyr::group_by(employment_orig, employment_cat, employment_primary) |>
  dplyr::summarise(n = dplyr::n()) |>
  print_reactable(sorted_col = "employment_orig", width = 800)
# Cleanup
rm(recode_employment, employment_labels)

Citizenship and Ethnicity

This item allowed participants to select multiple choices. The first eight options referred to ethnicity. Only some countries contained the options related to ethnicity.

The last three options referred to citizenship status. Participants were not allowed to select Citizen of [country] and Resident of [country] (non-citizen) simultaneously. All countries contained the citizenship options.

Citizenship
# Sanity check: View the counts of each option
table_label(df_merged$ethnicity_citizenship_orig)
$ethnicity_citizenship_orig
Please choose which best describes you. You must select at least one option from the top part and at least one option from the bottom. - Selected Choice
              1,10               1,11             1,2,10             1,2,11           1,2,3,10         1,2,3,4,10       1,2,3,4,5,10 
             24538                444                321                  6                 11                  4                  1 
    1,2,3,4,5,6,10 1,2,3,4,5,6,7,8,11     1,2,3,4,6,7,10       1,2,3,4,7,10   1,2,3,4,7,8,9,10         1,2,3,5,10       1,2,3,5,8,10 
                 1                  1                  1                  1                  1                  2                  1 
      1,2,3,5,9,10         1,2,3,8,10         1,2,3,9,10           1,2,4,10           1,2,4,11         1,2,4,5,10       1,2,4,6,7,10 
                 1                  2                  1                115                  3                 10                  1 
      1,2,4,6,8,10         1,2,4,8,10         1,2,4,8,11         1,2,4,9,10           1,2,5,10           1,2,6,10           1,2,6,11 
                 1                 11                  1                  5                  5                  2                  1 
        1,2,6,8,10           1,2,7,10         1,2,7,8,10           1,2,8,10           1,2,8,11              1,2,9           1,2,9,10 
                 1                  1                  1                 19                  2                 14                 16 
          1,2,9,11             1,3,10             1,3,11           1,3,4,10         1,3,4,5,10         1,3,4,6,10           1,3,5,10 
                 1                164                  2                  8                  1                  1                  3 
           1,3,5,9           1,3,6,10           1,3,8,10           1,3,8,11         1,3,8,9,10         1,3,8,9,11              1,3,9 
                 1                  1                 11                  1                  2                  1                  2 
          1,3,9,10             1,4,10             1,4,11           1,4,5,10           1,4,5,11         1,4,5,8,10            1,4,5,9 
                 5                134                  5                 14                  1                  3                  1 
        1,4,5,9,10           1,4,6,10         1,4,6,8,10           1,4,7,10         1,4,7,8,10           1,4,8,10         1,4,8,9,10 
                 1                  1                  1                  3                  1                  9                  2 
             1,4,9           1,4,9,10           1,4,9,11             1,5,10             1,5,11         1,5,6,8,10         1,5,6,9,10 
                 3                  8                  1                104                  4                  1                  1 
          1,5,7,10           1,5,8,10              1,5,9           1,5,9,10           1,5,9,11             1,6,10             1,6,11 
                 2                 10                  4                  3                  2                 72                  1 
          1,6,7,10           1,6,8,10            1,6,8,9           1,6,9,10           1,6,9,11             1,7,10             1,7,11 
                 1                  5                  1                  3                  1                 46                  1 
          1,7,8,10         1,7,8,9,10              1,7,9           1,7,9,10           1,7,9,11             1,8,10             1,8,11 
                 1                  2                  2                  2                  1                584                 30 
             1,8,9           1,8,9,10           1,8,9,11                1,9             1,9,10             1,9,11                 10 
                28                 17                  2                664                368                 95              10450 
                11               2,10               2,11             2,3,10             2,3,11     2,3,4,5,6,9,10       2,3,4,5,7,10 
               415               2307                222                 14                  3                  1                  1 
         2,3,4,5,9         2,3,4,8,11         2,3,4,9,11           2,3,5,11         2,3,5,9,11         2,3,6,8,10           2,3,7,10 
                 1                  1                  1                  3                  1                  1                  2 
          2,3,8,10         2,3,8,9,11              2,3,9           2,3,9,11             2,4,10             2,4,11         2,4,5,6,10 
                 2                  1                  3                  2                 80                  8                  1 
      2,4,5,6,7,10         2,4,5,7,10           2,4,6,10           2,4,7,10           2,4,8,10              2,4,9           2,4,9,10 
                 1                  1                  2                  1                  2                  2                  2 
            2,5,10             2,5,11           2,5,7,10           2,5,9,11             2,6,10           2,6,7,10           2,6,8,10 
                12                  3                  1                  4                 10                  1                  1 
             2,6,9           2,6,9,11             2,7,10         2,7,8,9,10              2,7,9           2,7,9,10             2,8,10 
                 2                  1                 40                  1                  2                  6                 83 
            2,8,11              2,8,9           2,8,9,10           2,8,9,11                2,9             2,9,10             2,9,11 
                 4                 32                  3                  5                306                 46                 60 
              3,10               3,11             3,4,10         3,4,5,7,10           3,4,6,11           3,4,7,10           3,4,7,11 
              1406                 64                 18                  1                  1                  3                  1 
          3,4,8,10            3,4,8,9              3,4,9           3,4,9,10           3,4,9,11             3,5,10         3,5,6,9,10 
                 2                  1                  2                  2                  1                 10                  1 
          3,5,7,10            3,5,7,9              3,5,9             3,6,10             3,7,10              3,7,9           3,7,9,10 
                 1                  1                  1                 11                 22                  1                  2 
            3,8,10             3,8,11              3,8,9           3,8,9,10           3,8,9,11                3,9             3,9,10 
                53                  6                  5                  4                  1                127                 36 
            3,9,11               4,10               4,11             4,5,10             4,5,11           4,5,7,10           4,5,8,10 
                11               2701                 97                 30                  1                  7                  3 
             4,5,9             4,6,10             4,6,11           4,6,7,10           4,6,8,10              4,6,9             4,7,10 
                 2                 25                  2                  1                  2                  1                135 
            4,7,11           4,7,8,10            4,7,8,9              4,7,9           4,7,9,10           4,7,9,11             4,8,10 
                 2                  2                  1                  6                  9                  2                100 
            4,8,11              4,8,9           4,8,9,10                4,9             4,9,10             4,9,11               5,10 
                 4                  7                  2                179                 56                 28               2617 
              5,11             5,6,10              5,6,9             5,7,10           5,7,8,10             5,8,10             5,8,11 
                45                  2                  1                 46                  4                 48                  2 
             5,8,9           5,8,9,10           5,8,9,11                5,9             5,9,10             5,9,11               6,10 
                 5                  4                  1                 90                 34                 20               1233 
              6,11             6,7,10              6,7,9           6,7,9,10             6,8,10             6,8,11              6,8,9 
                18                 10                  1                  1                 40                  1                  1 
          6,8,9,11                6,9             6,9,10             6,9,11               7,10               7,11             7,8,10 
                 1                 33                 14                 14               4584                 42                 34 
          7,8,9,11                7,9             7,9,10             7,9,11               8,10               8,11                8,9 
                 1                137                108                 34               1715                101                 98 
            8,9,10             8,9,11                  9               9,10               9,11               <NA> 
                47                 38                401                338                269               9825 
Class: character 
# Create variables for each citizenship option (9 to 11)
df_merged <- df_merged |>
  dplyr::mutate(
    citizenship_cat = base::factor(
      dplyr::case_when(

        # When option 10 = "Citizen of [country]" was selected
        # and option 9 = "Born outside [country]" was not selected.
        # It was not possible to select options 10 and 11 simultaneously.
        stringr::str_detect(ethnicity_citizenship_orig, stringr::fixed("10")) &
        !(stringr::str_detect(ethnicity_citizenship_orig, stringr::fixed("9")))
        ~ "Citizen",

        # When option 11 = "Resident of [country] (non-citizen)" was selected
        # and option 9 = "Born outside [country]" was not selected.
        # It was not possible to select options 10 and 11 simultaneously.
        stringr::str_detect(ethnicity_citizenship_orig, stringr::fixed("11")) &
        !(stringr::str_detect(ethnicity_citizenship_orig, stringr::fixed("9")))
        ~ "Non-citizen (Permanent Resident)",

        # When option 9 = "Born outside [country]" was selected
        # and option 10 = "Citizen of [country]" was selected.
        # It was not possible to select options 10 and 11 simultaneously.
        stringr::str_detect(ethnicity_citizenship_orig, stringr::fixed("9")) &
        stringr::str_detect(ethnicity_citizenship_orig, stringr::fixed("10"))
        ~ "Born outside country (Citizen)",

        # When option 9 = "Born outside [country]" was selected
        # and option 11 = "Resident of [country] (non-citizen)" was selected.
        # It was not possible to select options 10 and 11 simultaneously.
        stringr::str_detect(ethnicity_citizenship_orig, stringr::fixed("9")) &
        stringr::str_detect(ethnicity_citizenship_orig, stringr::fixed("11"))
        ~ "Born outside country (Non-citizen, Permanent Resident)",

        # When only option 9 = "Born outside [country]" was selected.
        stringr::str_detect(ethnicity_citizenship_orig, stringr::fixed("9")) &
        !(stringr::str_detect(ethnicity_citizenship_orig, stringr::fixed("11"))) &
        !(stringr::str_detect(ethnicity_citizenship_orig, stringr::fixed("10")))
        ~ "Born outside country (Non-citizen, Non-permanent Resident)",

        TRUE ~ NA_character_
      ),

    levels = c(
      "Citizen",
      "Non-citizen (Permanent Resident)",
      "Born outside country (Citizen)",
      "Born outside country (Non-citizen, Permanent Resident)",
      "Born outside country (Non-citizen, Non-permanent Resident)")
    )
  ) |>
  dplyr::relocate(citizenship_cat, .after = ethnicity_citizenship_orig)

# Sanity check: View the distribution of citizenship categories
df_merged |>
  dplyr::mutate(
    # Extract only the citizenship options selected
    citizenship_extract = stringr::str_extract_all(
      ethnicity_citizenship_orig, "(?<=^|,)(9|10|11)(?=,|$)") |>
      purrr::map_chr(\(i) {
        if (length(i) == 0) return(NA_character_)
        if (all(is.na(i))) return(NA_character_)
        paste(i[!is.na(i)], collapse = ",")
      })
  ) |> 
  dplyr::group_by(citizenship_extract, citizenship_cat) |>
  dplyr::summarise(n = dplyr::n())
# A tibble: 6 × 3
# Groups:   citizenship_extract [6]
  citizenship_extract citizenship_cat                                                n
  <chr>               <fct>                                                      <int>
1 10                  Citizen                                                    54110
2 11                  Non-citizen (Permanent Resident)                            1549
3 9                   Born outside country (Non-citizen, Non-permanent Resident)  2169
4 9,10                Born outside country (Citizen)                              1155
5 9,11                Born outside country (Non-citizen, Permanent Resident)       600
6 <NA>                <NA>                                                        9825
Ethnicity
# Upload the ethnicity categories translated that were used for each country
ethnicity_cat <- 
  readr::read_csv("111_ethnicity_labels_translated.csv", show_col_types = FALSE) |>
  dplyr::glimpse(width = 100)
Rows: 533
Columns: 3
$ UserLanguage  <chr> "AM-ARM", "AM-ARM", "AM-ARM", "AM-ARM", "AM-ARM", "AR-ARE", "AR-ARE", "AR-AR…
$ option_number <dbl> 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 6, 1, 1, 2, 3, 4, 1, 2, 3, 1, 2, 3, 4, 5, 6, 1…
$ label         <chr> "Armenians", "Ezidis", "Russians", "Assyrians", "Ukrainians", "Arab/Middle E…
# Extract the ethnicity options
df_merged <- df_merged |>
  dplyr::mutate(
    # Extract the ethnicity options
    # Don't extract option 8 = "Specify: [open text field]"
    # because that will be added later
    ethnicity_agg = stringr::str_extract_all(
      ethnicity_citizenship_orig,
      "(?<=^|,)(1|2|3|4|5|6|7)(?=,|$)"
    ) |>
      purrr::map_chr(\(i) {
        # Participants that did not complete this item should have NA.
        # Participants that completed a survey version
        # without ethnicity options should have NA.
        if (length(i) == 0) return(NA_character_)
        if (all(is.na(i))) return(NA_character_)
        paste(i[!is.na(i)], collapse = ",")
      })
  )

# Replace ethnicity options with the translated labels
# Transformation will be conducted in a temporary data frame for safety
df_temp <- df_merged |>
  # Remove missing values for this transformation
  dplyr::filter(!is.na(ethnicity_agg)) |>
  # Separate values into rows
  # (if participant wrote "1,2", create two rows: one with "1" and another with "2")
  tidyr::separate_rows(ethnicity_agg, sep = ",") |>
  # Create variable that is going to match with ethnicity_cat
  dplyr::mutate(option_number = as.numeric(stringr::str_trim(ethnicity_agg))) |>
  # Join ethnicity_cat to get the translated labels
  dplyr::left_join(ethnicity_cat, by = c("UserLanguage", "option_number")) |>
  # Bring back to former format of having multiple options in a single row
  # but now with the translated labels instead of numbers
  dplyr::group_by(ResponseId) |>
  dplyr::summarise(
    ethnicity_translated = base::paste(label[!is.na(label)], collapse = ",")
  )

# Join back to main data frame
nrow(df_merged)
[1] 69408
df_merged <- df_merged |>
  dplyr::left_join(df_temp, by = "ResponseId")

nrow(df_merged)
[1] 69408
# Cleanup
rm(df_temp)

# Sanity check: Are the number of missing values in the new variable the same
# as in the original variable plus those that only selected citizenship options
# or only the please specify option (8)?
sum(is.na(df_merged$ethnicity_translated)) ==
  (sum(is.na(df_merged$ethnicity_citizenship_orig)) + sum(
    !is.na(df_merged$ethnicity_citizenship_orig) &
      stringr::str_detect(
        df_merged$ethnicity_citizenship_orig,
        "^(?:\\s*(?:8|9|10|11)\\s*)(?:,\\s*(?:8|9|10|11)\\s*)*$"
      )
  ))
[1] TRUE
# Add the cleaned responses from the "Specify: [open text field]" option (8)
ethnicity_recoded <- 
  readr::read_csv("111_ethnicity_open_answers_recoded.csv", show_col_types = FALSE) |>
  dplyr::glimpse(width = 100)
Rows: 3,274
Columns: 2
$ ResponseId        <chr> "R_8FrYunIVSiVeX5B", "R_8rYZOG6u8qXwprj", "R_8p9yE9TFIjGUonc", "R_2Lzosf…
$ ethnicity_specify <chr> "Cannot determine", "Cannot determine", "Cannot determine", "Cannot dete…
df_merged <- df_merged |>
  dplyr::left_join(ethnicity_recoded, by = "ResponseId") |>
  dplyr::relocate(ethnicity_agg:ethnicity_specify, .after = ethnicity_citizenship_orig)

# Sanity check
dplyr::glimpse(df_merged |>
  dplyr::group_by(ethnicity_citizenship_orig, UserLanguage) |>
  dplyr::distinct(ethnicity_citizenship_orig, UserLanguage,
           ethnicity_agg, ethnicity_translated, ethnicity_specify,
           .keep_all = TRUE) |>
  dplyr::ungroup() |>
  dplyr::select(UserLanguage, ethnicity_citizenship_orig,
                ethnicity_agg, ethnicity_translated, ethnicity_specify), 
  width = 100)
Rows: 2,567
Columns: 5
$ UserLanguage               <chr> "FR-SEN", "FR-SEN", "PT-BRA", "PT-BRA", "PT-BRA", "FIL-PHL", "P…
$ ethnicity_citizenship_orig <chr> "3,6,10", "1,10", "5,10", "3,10", "1,10", "1,10", "1,9", "2,3,5…
$ ethnicity_agg              <chr> "3,6", "1", "5", "3", "1", "1", "1", "2,3,5", "2", "2", "1,2", …
$ ethnicity_translated       <chr> "Diola / Malinké,Haalpulaaren", "Wolof / Lébou", "White", "Blac…
$ ethnicity_specify          <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Mo…
# View values
df_merged |>
  dplyr::filter(!is.na(ethnicity_citizenship_orig)) |>
  dplyr::mutate(
    ethnicity_extract = stringr::str_extract_all(
      ethnicity_citizenship_orig,
      "(?<=^|,)(1|2|3|4|5|6|7|8)(?=,|$)"
    ) |>
      purrr::map_chr(\(i) {
        if (length(i) == 0) return(NA_character_)
        if (all(is.na(i))) return(NA_character_)
        paste(i[!is.na(i)], collapse = ",")
      })
  ) |>
  dplyr::group_by(country, ethnicity_extract, ethnicity_agg, 
                  ethnicity_translated, ethnicity_specify) |>
  dplyr::summarise(n = dplyr::n(), .groups = "drop") |>
  dplyr::arrange(country) |>
  print_reactable(sorted_col = "country", width = 800)
# Cleanup
rm(ethnicity_cat, ethnicity_recoded)

Honeypot for Bots

An item was added to the survey, and via JavaScript code, the item was hidden from human participants. Bots that do not compute JavaScript code would process this item and provide an answer, which would allow us to identify them.

# NA indicates a valid response.
# All responses were valid, no bots were found through this item.
base::table(df_merged$bot_check, useNA = "always")

 <NA> 
69408 

Optional Ending

This item was optional and can be NA. Also, this item was not shown to sponsored participants.

# The item is already numeric and within the scale.
table_label(df_merged$followup)
$followup
You have now finished the main survey! Would you be willing to answer a few more similar questions?
    1     2  <NA> 
34936 13566 20906 
Class: numeric 

Childhood Socioeconomic Status

# Sanity check: View the counts of each option
table_label(df_merged$childhood_SES)
$childhood_SES
As a child, how would you describe the financial situation in your household compared to a typical home where you grew up?
    1     2     3     4     5  <NA> 
 4564 10353 15104  8711  1219 29457 
Class: numeric 
# Create categorical variable with labels
df_merged <- df_merged |>
  dplyr::mutate(
    childhood_SES_cat = base::factor(
      dplyr::case_when(
        childhood_SES == 1 ~ "Poor",
        childhood_SES == 2 ~ "Below average but not poor",
        childhood_SES == 3 ~ "Around average",
        childhood_SES == 4 ~ "Above average but not wealthy",
        childhood_SES == 5 ~ "Wealthy",
        TRUE ~ NA_character_
    ),
    levels = c(
      "Poor",
      "Below average but not poor",
      "Around average",
      "Above average but not wealthy",
      "Wealthy"
    ),
    ordered = TRUE)) |>
  dplyr::relocate(childhood_SES_cat, .after = childhood_SES)

# Sanity check: View the distribution of the new variable
base::table(df_merged$childhood_SES_cat, useNA = "always")

                         Poor    Below average but not poor                Around average Above average but not wealthy                       Wealthy 
                         4564                         10353                         15104                          8711                          1219 
                         <NA> 
                        29457 

Financial Outlook and Confidence

# Nothing to do here, the item is already numeric and
# within the minimum and maximum values.

table_label(df_merged$fin_outlook)
$fin_outlook
What is your expectation for how things will be for you financially one year from now?
    1     2     3     4     5  <NA> 
 1663  3952 13487 14794  5958 29554 
Class: numeric 
table_label(df_merged$fin_outlook_conf)
$fin_outlook_conf
On a scale from 1 (completely uncertain) to 10 (completely certain), how confident are you in your answer to the last question?
    1     2     3     4     5     6     7     8     9    10  <NA> 
  995   527  1014  1732  4371  4260  6809  8042  4590  7514 29554 
Class: numeric 
df_merged <- df_merged |>
  dplyr::mutate(
    fin_outlook_cat = base::factor(
      dplyr::case_when(
        fin_outlook == 1 ~ "Things will be much worse",
        fin_outlook == 2 ~ "Things will be somewhat worse",
        fin_outlook == 3 ~ "Things will be about the same",
        fin_outlook == 4 ~ "Things will be somewhat better",
        fin_outlook == 5 ~ "Things will be much better",
        TRUE ~ NA_character_
      ),
      levels = c(
        "Things will be much worse",
        "Things will be somewhat worse",
        "Things will be about the same",
        "Things will be somewhat better",
        "Things will be much better"
      ),
      ordered = TRUE
    )
  ) |>
  dplyr::relocate(fin_outlook_cat, .after = fin_outlook)

Attention and Care

# Sanity check: View the counts of each option
table_label(df_merged$attention_care)
$attention_care
There are people that care about and pay attention to what goes on in my life.
    1     2     3     4     5     6     7  <NA> 
  836   838  1949  4431 13507  9044  9214 29589 
Class: numeric 
df_merged <- df_merged |>
  dplyr::mutate(
    attention_care_cat = base::factor(
      dplyr::case_when(
        attention_care == 1 ~ "Completely disagree",
        attention_care == 2 ~ "Strongly disagree",
        attention_care == 3 ~ "Disagree",
        attention_care == 4 ~ "Neutral",
        attention_care == 5 ~ "Agree",
        attention_care == 6 ~ "Strongly agree",
        attention_care == 7 ~ "Completely agree",
        TRUE ~ NA_character_
    ),
    levels = c(
      "Completely disagree",
      "Strongly disagree",
      "Disagree",
      "Neutral",
      "Agree",
      "Strongly agree",
      "Completely agree"
    ),
    ordered = TRUE)) |>
  dplyr::relocate(attention_care_cat, .after = attention_care)

# Sanity check: View the distribution of the new variable
df_merged |>
  dplyr::count(attention_care, attention_care_cat)
# A tibble: 8 × 3
  attention_care attention_care_cat      n
           <dbl> <ord>               <int>
1              1 Completely disagree   836
2              2 Strongly disagree     838
3              3 Disagree             1949
4              4 Neutral              4431
5              5 Agree               13507
6              6 Strongly agree       9044
7              7 Completely agree     9214
8             NA <NA>                29589

Workplace Arragement

# Sanity check: View the counts of each option
table_label(df_merged$work_arrangement)
$work_arrangement
Which most accurately describes your current work (or study) arrangement?
    1     2     3     4     5  <NA> 
17717  6169  3558  3226  2890 35848 
Class: numeric 
# Create categorical variable with labels
df_merged <- df_merged %>%
  dplyr::mutate(
    work_arrangement_cat = base::factor(
      dplyr::case_when(

        work_arrangement == 1
        ~ "I work entirely in-person (i.e., in an office, on-site)",

        work_arrangement == 2
        ~ "I mostly work in-person, with occasional remote days",

        work_arrangement == 3
        ~ "I work about evenly in-person/remote",

        work_arrangement == 4
        ~ "I mostly work remotely, with occasional in-person days",

        work_arrangement == 5
        ~ "I work entirely remotely",

        TRUE ~ NA_character_
      ),
      levels = c(
        "I work entirely in-person (i.e., in an office, on-site)",
        "I mostly work in-person, with occasional remote days",
        "I work about evenly in-person/remote",
        "I mostly work remotely, with occasional in-person days",
        "I work entirely remotely"
      ),
      ordered = TRUE
    ),

    work_arrangement_cat_nostudents = base::factor(
      dplyr::if_else(
        employment_primary == "Student non-working (Full or part-time)",
        NA_character_,
        as.character(work_arrangement_cat)
      ),
      levels = levels(work_arrangement_cat),
      ordered = TRUE
    )
  ) |>
  dplyr::relocate(work_arrangement_cat, 
                  work_arrangement_cat_nostudents,
                  .after = work_arrangement)

# Sanity check: View the distribution of the new variable
df_merged |>
  dplyr::group_by(work_arrangement, work_arrangement_cat) |>
  dplyr::summarise(n = dplyr::n())
# A tibble: 6 × 3
# Groups:   work_arrangement [6]
  work_arrangement work_arrangement_cat                                        n
             <dbl> <ord>                                                   <int>
1                1 I work entirely in-person (i.e., in an office, on-site) 17717
2                2 I mostly work in-person, with occasional remote days     6169
3                3 I work about evenly in-person/remote                     3558
4                4 I mostly work remotely, with occasional in-person days   3226
5                5 I work entirely remotely                                 2890
6               NA <NA>                                                    35848
# Sanity check: View the distribution of the new variable excluding students
base::table(df_merged$employment_primary,
            df_merged$work_arrangement_cat_nostudents, useNA = "always")
                                                          
                                                           I work entirely in-person (i.e., in an office, on-site)
  Not in paid employment (by choice/health)                                                                      0
  Not in paid employment (looking for work)                                                                    295
  Student non-working (Full or part-time)                                                                        0
  Employed/working full-time (25+ hours per week)                                                            13053
  Employed/working part-time (less than 25 hours per week)                                                    1898
  Retired                                                                                                        0
  Military service                                                                                             234
  <NA>                                                                                                           0
                                                          
                                                           I mostly work in-person, with occasional remote days I work about evenly in-person/remote
  Not in paid employment (by choice/health)                                                                   0                                    0
  Not in paid employment (looking for work)                                                                 118                                   79
  Student non-working (Full or part-time)                                                                     0                                    0
  Employed/working full-time (25+ hours per week)                                                          4363                                 2251
  Employed/working part-time (less than 25 hours per week)                                                  773                                  581
  Retired                                                                                                     0                                    0
  Military service                                                                                           51                                   48
  <NA>                                                                                                        0                                    0
                                                          
                                                           I mostly work remotely, with occasional in-person days I work entirely remotely  <NA>
  Not in paid employment (by choice/health)                                                                     0                        0  3567
  Not in paid employment (looking for work)                                                                    71                       97  3511
  Student non-working (Full or part-time)                                                                       0                        0  8478
  Employed/working full-time (25+ hours per week)                                                            2051                     1571 10252
  Employed/working part-time (less than 25 hours per week)                                                    586                      632  2061
  Retired                                                                                                       0                        0  2592
  Military service                                                                                             37                       28   559
  <NA>                                                                                                          0                        0  9571

Identification of Sponsored Participants

# Sanity check: View the counts of each option
table_label(df_merged$br)
$br
id
    5  <NA> 
 6445 62963 
Class: numeric 
table_label(df_merged$bs)
$bs
pay
    1  <NA> 
    1 69407 
Class: numeric 
table_label(df_merged$irl)
    0     1  <NA> 
68208  1200     0 
Class: numeric 
# Create a new variable to identify sponsored participants
df_merged <- df_merged |>
  dplyr::mutate(
    sponsored = dplyr::if_else(
      !is.na(br) | !is.na(bs) | irl == 1, 1, 0
    )
  )

# Sanity check
base::table(df_merged$sponsored, useNA = "always")

    0     1  <NA> 
61762  7646     0 

A0.2. Applying exclusion criteria

Direct exclusion criteria

# Identify exclusion criteria and assign status
df_merged <- df_merged |>

  # Create explicit flags for each rule
  dplyr::mutate(
    incomplete = is.na(debts_orig) & irl == 0,

    # E1. Not resident based on manual checking of location validation
    # important to note that the USA version was the default
    # when the survey link was broken or shared without specifying a country
    # in the URL metadata parameters.
    not_resident = loc_resident == 0,

    # E2. Implausible combination of working (3, 4, or 5 on employment)
    # and reporting zero income.
    working_zero_income =
      (stringr::str_detect(employment_orig, "\\b(3|4|5)\\b")) &
      (income_orig == 0 | income_text_clean == 0),

    # E3. Implausible combination of being retired (6 on employment)
    # and having an age <= 25
    retired_young =
      (stringr::str_detect(employment_orig, "\\b6\\b")) &
      (!is.na(age) & age <= 25),

    # E4. Implausible combination of reporting
    # very high MPWB (well-being) and very high PHQ4 (distress)
    extremes_mpwb_phq4 =
      !is.na(gad_worry) &
      (mpwb_sum >= 65 & phq4_sum >= 24),

    # E5. Respondents reporting high MPWB (well-being) and high PHQ-4 (distress),
    # combined with unusually short adjusted completion time.
    high_mpwb_phq4_speed =
      !is.na(gad_worry) &
      !is.na(duration_adj) &
      (mpwb_sum >= 64 & phq4_sum >= 23 & duration_adj < 10),

    # E6. Too-fast based on raw duration,
    # except sponsored participants from Ireland (who don't have duration data)
    too_fast_raw = duration_sec < 150 & irl == 0,

    # E7. We observed a China-specific pattern of
    # unusually fast completion times and low response variance.
    china_too_fast_low_var =
      country == "China" &
      duration_adj < 10 &
      mpwb_var < 1
  ) |>

  # Assign status based on the ordered exclusion criteria
  # (first match is assigned and the rest ignored)
  dplyr::mutate(
    valid_status = base::factor(dplyr::case_when(
      incomplete ~ "incomplete",
      not_resident ~ "not residents",
      working_zero_income ~ "implausible working with no income",
      retired_young ~ "implausible retired young",
      extremes_mpwb_phq4 ~ "implausible extremes",
      high_mpwb_phq4_speed ~ "implausible high scores with speed",
      too_fast_raw ~ "too fast general",
      china_too_fast_low_var ~ "too fast low var",
      TRUE ~ "passed"
    ),
    levels = c(
      "incomplete",
      "not residents",
      "implausible working with no income",
      "implausible retired young",
      "implausible extremes",
      "implausible high scores with speed",
      "too fast general",
      "too fast low var",
      "passed")
    )
  )

# Sanity checks: Overall counts per status
base::table(df_merged$valid_status, useNA = "always")

                        incomplete                      not residents implausible working with no income          implausible retired young 
                             12181                                705                                271                                 39 
              implausible extremes implausible high scores with speed                   too fast general                   too fast low var 
                                48                                  7                               1595                                737 
                            passed                               <NA> 
                             53825                                  0 
# Sanity checks: Check counts for incomplete
df_merged |>
  dplyr::filter(incomplete) |>
  dplyr::group_by(valid_status, Finished, debts_orig, phq_interest) |>
  dplyr::summarise(max_progress = max(Progress), n_incomplete = dplyr::n())
# A tibble: 1 × 6
# Groups:   valid_status, Finished, debts_orig [1]
  valid_status Finished debts_orig phq_interest max_progress n_incomplete
  <fct>           <dbl> <chr>             <dbl>        <dbl>        <int>
1 incomplete          0 <NA>                 NA           77        12181
# Sanity checks: Check counts for not residents
df_merged |>
  dplyr::filter(not_resident & !incomplete) |>
  dplyr::group_by(valid_status, country, loc_country) |>
  dplyr::summarise(n_not_resident = dplyr::n()) |>
  dplyr::arrange(country) |>
  print_reactable(sorted_col = "country", width = 500)
# Sanity checks: Check counts for participants working with zero income
df_merged |>
  dplyr::filter(working_zero_income & !not_resident & !incomplete) |>
  dplyr::group_by(valid_status, employment_cat, income_orig, income_text_orig) |>
  dplyr::summarise(n_working_zero_income = dplyr::n()) |>
  dplyr::arrange(-n_working_zero_income)
# A tibble: 15 × 5
# Groups:   valid_status, employment_cat, income_orig [15]
   valid_status                       employment_cat                                                income_orig income_text_orig n_working_zero_income
   <fct>                              <chr>                                                               <dbl> <chr>                            <int>
 1 implausible working with no income Employed/working full-time (25+ hours per week)                         0 <NA>                               119
 2 implausible working with no income Employed/working part-time (less than 25 hours per week)                0 <NA>                                45
 3 implausible working with no income Employed/working full-time (25+ hours per week)                        10 0                                   20
 4 implausible working with no income Full-time student; Employed/working part-time (less than 25 …           0 <NA>                                20
 5 implausible working with no income Full-time student; Employed/working full-time (25+ hours per…           0 <NA>                                14
 6 implausible working with no income Part-time student; Employed/working part-time (less than 25 …           0 <NA>                                13
 7 implausible working with no income Part-time student; Employed/working full-time (25+ hours per…           0 <NA>                                12
 8 implausible working with no income Military service                                                        0 <NA>                                10
 9 implausible working with no income Full-time student; Military service                                     0 <NA>                                 6
10 implausible working with no income Employed/working part-time (less than 25 hours per week)               10 0                                    3
11 implausible working with no income Part-time student; Military service                                     0 <NA>                                 3
12 implausible working with no income Employed/working full-time (25+ hours per week); Military se…           0 <NA>                                 2
13 implausible working with no income Military service                                                       10 0                                    2
14 implausible working with no income Full-time student; Employed/working full-time (25+ hours per…          10 0                                    1
15 implausible working with no income Part-time student; Employed/working part-time (less than 25 …          10 0                                    1
# Sanity checks: Check counts for retired young participants
df_merged |>
  dplyr::filter(retired_young &
                  !working_zero_income & !not_resident & !incomplete) |>
  dplyr::group_by(valid_status, employment_orig, age_group) |>
  dplyr::summarise(n_retired_young = dplyr::n()) |>
  dplyr::arrange(-n_retired_young)
# A tibble: 4 × 4
# Groups:   valid_status, employment_orig [4]
  valid_status              employment_orig age_group n_retired_young
  <fct>                     <chr>           <fct>               <int>
1 implausible retired young 6               18-25                  27
2 implausible retired young 1,6             18-25                   6
3 implausible retired young 2,6             18-25                   2
4 implausible retired young 3,6             18-25                   2
# Sanity checks: Check counts for extremes in MPWB and PHQ4
df_merged |>
  dplyr::filter(extremes_mpwb_phq4 & !retired_young &
                  !working_zero_income & !not_resident & !incomplete) |>
  dplyr::group_by(valid_status, mpwb_sum, phq4_sum) |>
  dplyr::summarise(n_extremes_mpwb_phq4 = dplyr::n()) |>
  dplyr::arrange(-n_extremes_mpwb_phq4)
# A tibble: 6 × 4
# Groups:   valid_status, mpwb_sum [3]
  valid_status         mpwb_sum phq4_sum n_extremes_mpwb_phq4
  <fct>                   <dbl>    <dbl>                <int>
1 implausible extremes       70       28                   12
2 implausible extremes       69       28                    2
3 implausible extremes       67       25                    1
4 implausible extremes       69       24                    1
5 implausible extremes       70       25                    1
6 implausible extremes       70       26                    1
# Sanity checks: Check counts for participants with high scores on mpwb and phq4,
# plus unusual speed
df_merged |>
  dplyr::filter(high_mpwb_phq4_speed & !extremes_mpwb_phq4 & !retired_young &
                  !working_zero_income & !not_resident & !incomplete) |>
  dplyr::group_by(valid_status, mpwb_sum, phq4_sum, duration_adj) |>
  dplyr::summarise(n_high_mpwb_phq4_speed = dplyr::n()) |>
  dplyr::arrange(-n_high_mpwb_phq4_speed)
# A tibble: 3 × 5
# Groups:   valid_status, mpwb_sum, phq4_sum [3]
  valid_status                       mpwb_sum phq4_sum duration_adj n_high_mpwb_phq4_speed
  <fct>                                 <dbl>    <dbl>        <dbl>                  <int>
1 implausible high scores with speed       64       23         7.5                       1
2 implausible high scores with speed       64       28         5.32                      1
3 implausible high scores with speed       66       23         6.04                      1
# Sanity checks: Check counts for participants with high scores on mpwb and phq4,
# plus unusual speed
df_merged |>
  dplyr::filter(too_fast_raw & !high_mpwb_phq4_speed &
                  !extremes_mpwb_phq4 & !retired_young &
                  !working_zero_income & !not_resident & !incomplete) |>
  dplyr::group_by(valid_status) |>
  dplyr::summarise(
    min(duration_sec), max(duration_sec), n_too_fast_raw = dplyr::n())
# A tibble: 1 × 4
  valid_status     `min(duration_sec)` `max(duration_sec)` n_too_fast_raw
  <fct>                          <dbl>               <dbl>          <int>
1 too fast general                  48                 149            718
# Sanity checks: Check counts for China-specific exclusion
df_merged |>
  dplyr::filter(china_too_fast_low_var & !too_fast_raw &
                  !high_mpwb_phq4_speed & !extremes_mpwb_phq4 & !retired_young &
                  !working_zero_income & !not_resident & !incomplete) |>
  dplyr::group_by(valid_status, country) |>
  dplyr::summarise(
    min(mpwb_var), max(mpwb_var),
    min(duration_adj), max(duration_adj), n_china_too_fast_low_var = dplyr::n())
# A tibble: 1 × 7
# Groups:   valid_status [1]
  valid_status     country `min(mpwb_var)` `max(mpwb_var)` `min(duration_adj)` `max(duration_adj)` n_china_too_fast_low_var
  <fct>            <chr>             <dbl>           <dbl>               <dbl>               <dbl>                    <int>
1 too fast low var China                 0           0.989                   5                9.95                      439
# Clean data to only include "passed" participants
df_clean <- df_merged |>
  dplyr::filter(valid_status == "passed")

Assessments to the Financial variables

Collaborators reviewed the financial variables and created flags indicating whether the responses were valid or not. Basic demographic information about the participants was given only upon request to assist with the review. The income, assets, and debts values that fell within the first income bracket and the last bracket were considered valid by default.

In Zimbabwe, all values were sent for review because there was a concern that participants reported values in Zimbabwean dollar instead of USD as collaborators used in the translation. Also, 14 participants from USA with a value equal to the first income bracket should have been accepted automatically but were sent for review by mistake.

Collaborators were also asked to provide a minimum cut-off for each variable. When the minimum cut-off was higher than the first income bracket, their sheet was updated with the values between the first income bracket and the minimum cut-off. Values of 0 in either financial variable were automatically accepted as is and were not given to collaborators for revision. The values that contained NA, “,” or “.” were also requested for review in order to validate our cleaning script.

The countries where sociodemographic information were provided were: Albania, Bangladesh, Finland, Georgia, Japan, Latvia, Lebanon, Oman, Peru, Portugal, Qatar, Russia, Singapore, Switzerland, Timor-Leste, Ukraine, USA, and Zimbabwe.

This assessment was not conducted for the sponsored participants from Ireland, as they did not provide open field answers regarding income, and were not asked to report assets and debts.

# A manual revision of the values was conducted before the sheet was given to
# collaborators.

df_clean <- df_clean |>
  dplyr::mutate(
    fin_valid_aut_income =
      dplyr::case_when(

      # For participants that selected a decile
      # instead of providing an open text answer, consider them accepted
      income_orig < 10 ~ 1,

      # Values of 0 are automatically accepted as is.
      income_text_clean == 0 ~ 1,

      # If value contains "," or "." or other non-digit, consider them not accepted,
      # so collaborators can review them.
      !(stringr::str_detect(income_text_orig, "^[0-9]+$")) ~ 0,

      # If we detected a weird number, consider them not accepted.
      income_wrd ~ 0,

      # If value is above 0 but below the income first bracket,
      # consider them not accepted.
      irl == 0 &
      !is.na(income_text_orig) &
      income_text_clean != 0 &
      income_text_clean > 0 & income_text_clean < income_highpoint_1 ~ 0,

      # For all other participants, execute automatic assessment:
      # The values that were within the income first bracket
      # and the value of the last income bracket were considered not accepted.
      irl == 0 &
      !is.na(income_text_orig) &
      income_text_clean != 0 &
      income_text_clean >= income_highpoint_1 &
      income_text_clean <= income_lowpoint_9 ~ 1,

      # For values above the last income bracket, consider them not accepted.
      irl == 0 &
      !is.na(income_text_orig) &
      income_text_clean != 0 &
      income_text_clean > income_lowpoint_9 ~ 0,

      TRUE ~ NA_real_
    ),

    fin_valid_aut_assets =
      dplyr::case_when(

      # Sponsored participants from Ireland are assigned NA
      # because no open text answers were collected from them.
      irl == 1 ~ NA_real_,
      assets_clean == 0 ~ 1,

      !(stringr::str_detect(assets_orig, "^[0-9]+$")) ~ 0,

      assets_wrd ~ 0,

      irl == 0 &
      (!is.na(assets_orig) &
      assets_clean > 0 & assets_clean < income_highpoint_1) ~ 0,

      irl == 0 &
      (!is.na(assets_orig) &
      assets_clean != 0 &
      assets_clean >= income_highpoint_1 &
      assets_clean <= income_lowpoint_9) ~ 1,

      irl == 0 &
      (!is.na(assets_orig) &
      assets_clean != 0 &
      assets_clean > income_lowpoint_9) ~ 0,

      TRUE ~ NA_real_
    ),

    fin_valid_aut_debts =
      dplyr::case_when(
      irl == 1 ~ NA_real_,
      debts_clean == 0 ~ 1,

      !(stringr::str_detect(debts_orig, "^[0-9]+$")) ~ 0,

      debts_wrd ~ 0,

      irl == 0 &
      (!is.na(debts_orig) &
      debts_clean > 0 & debts_clean < income_highpoint_1) ~ 0,

      irl == 0 &
      (!is.na(debts_orig) &
      debts_clean != 0 &
      debts_clean >= income_highpoint_1 &
      debts_clean <= income_lowpoint_9) ~ 1,

      irl == 0 &
      (!is.na(debts_orig) &
      debts_clean != 0 &
      debts_clean > income_lowpoint_9) ~ 0,

      TRUE ~ NA_real_
    )
  )

# Examine if the minimum cut-off provided is higher than the first income bracket.
df_clean <- df_clean |>
  dplyr::mutate(

    income_above_cutoff = income_cutoff_min > income_highpoint_1,
    assets_above_cutoff = assets_cutoff_min > income_highpoint_1,
    debts_above_cutoff = debts_cutoff_min > income_highpoint_1,

    fin_valid_aut_income_update =
      dplyr::case_when(
        income_above_cutoff == FALSE ~ fin_valid_aut_income,

        income_above_cutoff == TRUE &
        income_text_clean >= income_highpoint_1 &
        income_text_clean < income_cutoff_min ~ 0,

        TRUE ~ fin_valid_aut_income
    ),

    fin_valid_aut_assets_update =
      dplyr::case_when(
        assets_above_cutoff == FALSE ~ fin_valid_aut_assets,

        assets_above_cutoff == TRUE &
        assets_clean >= income_highpoint_1 &
        assets_clean < assets_cutoff_min ~ 0,

        TRUE ~ fin_valid_aut_assets
    ),

    fin_valid_aut_debts_update =
      dplyr::case_when(
        debts_above_cutoff == FALSE ~ fin_valid_aut_debts,

        debts_above_cutoff == TRUE &
        debts_clean >= income_highpoint_1 &
        debts_clean < debts_cutoff_min ~ 0,

        TRUE ~ fin_valid_aut_debts
    )
  )

# Sanity check: View the counts of automatic financial validity
base::table(df_clean$fin_valid_aut_income, useNA = "always")

    0     1  <NA> 
 3185 50531   109 
base::table(df_clean$fin_valid_aut_assets, useNA = "always")

    0     1  <NA> 
27728 24897  1200 
base::table(df_clean$fin_valid_aut_debts, useNA = "always")

    0     1  <NA> 
13503 39122  1200 

After we transmitted the values that were not automatically classified to collaborators in each country for review, we received back their assessments. We have extracted automatically the sheet with their assessments, and combined them into a single file.

# Extract sections from Excel files in folder "777_countries_documentation"
files <- list.files(
  path = "777_countries_documentation",
  pattern = "\\.xls[x]?$",
  full.names = TRUE) |>
  purrr::discard(
    # Exclude files named 777_Zambia and 777_Global
    ~stringr::str_detect(basename(.x),"^777_(Zambia|Global)"))

process_sheet <- function(path, sheet_name, start_row, tab_label) {
  sheet_all <- readxl::read_excel(path, sheet = sheet_name, col_names = FALSE)
  section <- sheet_all |> dplyr::slice(start_row:nrow(sheet_all)) |> dplyr::select(1:9)
  # drop header row
  section <- section |> dplyr::slice(-1)
  names(section) <- c(
    "ResponseId",
    "UserLanguage",
    "orig",
    "clean",
    "classification",
    "value",
    "cutoff_max",
    "cutoff_min",
    "notes"
  )
    section <- section |>
      dplyr::mutate(
      file = tools::file_path_sans_ext(basename(path)),
      tab = tab_label
    )
  section
}

assessment_fin <- purrr::map_dfr(files, function(path) {
  d1 <- process_sheet(path, "HOUSEHOLD INCOME", 22, "income")
  d2 <- process_sheet(path, "ASSETS", 9, "assets")
  d3 <- process_sheet(path, "DEBTS", 9, "debts")
  dplyr::bind_rows(d1, d2, d3)
}) |>
  dplyr::rename(
    change = value
  ) |>
  dplyr::mutate(
    clean = base::as.numeric(clean),
    cutoff_max = base::as.numeric(cutoff_max),
    cutoff_min = base::as.numeric(cutoff_min)
  ) |>
  tidyr::pivot_wider(
    id_cols = c("ResponseId", "UserLanguage"),
    names_from = "tab",
    values_from = c(
      "change",
      "classification",
      "cutoff_min",
      "cutoff_max",
      "orig",
      "clean"
    ),
    names_sep = "_"
  )
# Sanity check: View the assessment_fin data frame
dplyr::glimpse(assessment_fin, width = 100)
Rows: 33,245
Columns: 20
$ ResponseId            <chr> "R_2S9d1LQe5gzhMGp", "R_2duaXZQf76tNTnX", "R_8YEzJo4GF1VSJiU", "R_8D…
$ UserLanguage          <chr> "SQI-ALB", "SQI-ALB", "SQI-ALB", "SQI-ALB", "SQI-ALB", "SQI-ALB", "S…
$ change_income         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ change_assets         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ change_debts          <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ classification_income <chr> "Not possible/not believable", "Not possible/not believable", "Not p…
$ classification_assets <chr> "OK", "OK", NA, "OK", "OK", "OK", "OK", "OK", "OK", NA, NA, NA, NA, …
$ classification_debts  <chr> "OK", NA, "Cannot determine", NA, "Cannot determine", NA, NA, NA, NA…
$ cutoff_min_income     <dbl> 12000, 12000, 12000, 12000, 12000, 12000, 12000, 12000, 12000, 12000…
$ cutoff_min_assets     <dbl> 1000, 1000, NA, 1000, 1000, 1000, 1000, 1000, 1000, NA, NA, NA, NA, …
$ cutoff_min_debts      <dbl> 1000, NA, 1000, NA, 1000, NA, NA, NA, NA, NA, NA, NA, NA, 1000, NA, …
$ cutoff_max_income     <dbl> 2500000, 2500000, 2500000, 2500000, 2500000, 2500000, 2500000, 25000…
$ cutoff_max_assets     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ cutoff_max_debts      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ orig_income           <chr> "100", "410", "500", "600", "700", "800", "900", "1200", "2000", "30…
$ orig_assets           <chr> "10000", "100000", NA, "10000", "5000", "5000", "3000", "500.000", "…
$ orig_debts            <chr> "5000", NA, "500", NA, "250", NA, NA, NA, NA, NA, NA, NA, NA, "30000…
$ clean_income          <dbl> 100, 410, 500, 600, 700, 800, 900, 1200, 2000, 3000, 3840, 5000, 700…
$ clean_assets          <dbl> 10000, 100000, NA, 10000, 5000, 5000, 3000, 500000, 2000, NA, NA, NA…
$ clean_debts           <dbl> 5000, NA, 500, NA, 250, NA, NA, NA, NA, NA, NA, NA, NA, 30000, NA, N…
# Sanity check: Compare the cut-off min values
fin_cut_income <- assessment_fin |> dplyr::group_by(UserLanguage, cutoff_min_income) |> dplyr::summarise() |> dplyr::filter(!is.na(cutoff_min_income)) |> dplyr::rename(income_cutoff_min = cutoff_min_income)

df_cut_income <- df_clean |> dplyr::group_by(UserLanguage, income_cutoff_min) |> dplyr::summarise() |> dplyr::filter(!is.na(income_cutoff_min) & UserLanguage %in% fin_cut_income$UserLanguage)

dplyr::setequal(fin_cut_income, df_cut_income)
[1] TRUE
fin_cut_assets <- assessment_fin |> dplyr::group_by(UserLanguage, cutoff_min_assets) |> dplyr::summarise() |> dplyr::filter(!is.na(cutoff_min_assets)) |> dplyr::rename(assets_cutoff_min = cutoff_min_assets)

df_cut_assets <- df_clean |> dplyr::group_by(UserLanguage, assets_cutoff_min) |> dplyr::summarise() |> dplyr::filter(!is.na(assets_cutoff_min) & UserLanguage %in% fin_cut_assets$UserLanguage)

dplyr::setequal(fin_cut_assets, df_cut_assets)
[1] TRUE
fin_cut_debts <- assessment_fin |> dplyr::group_by(UserLanguage, cutoff_min_debts) |> summarise() |> dplyr::filter(!is.na(cutoff_min_debts)) |> dplyr::rename(debts_cutoff_min = cutoff_min_debts)

df_cut_debts <- df_clean |> dplyr::group_by(UserLanguage, debts_cutoff_min) |> dplyr::summarise() |> filter(!is.na(debts_cutoff_min) & UserLanguage %in% fin_cut_debts$UserLanguage)

dplyr::setequal(fin_cut_debts, df_cut_debts)
[1] TRUE
# Sanity check:
# Are there any UserLanguage in assessment_fin that are not in df_merged?
base::setdiff(
  unique(assessment_fin$UserLanguage),
  unique(df_merged$UserLanguage)
)
character(0)
# Sanity check:
# All values between clean_income in assessment_fin
# and income_text_clean in df_clean match?
assessment_fin |> select(ResponseId, clean_income) |> filter(!is.na(clean_income)) |>
  dplyr::left_join(
    df_clean |> select(ResponseId, income_text_clean),
    by = "ResponseId"
  ) |>
  dplyr::mutate(match = clean_income == income_text_clean) |> dplyr::group_by(match) |> dplyr::summarise(n = dplyr::n())
# A tibble: 1 × 2
  match     n
  <lgl> <int>
1 TRUE   3183
# All values between clean_assets in assessment_fin
# and assets_clean in df_clean match?
assessment_fin |> select(ResponseId, clean_assets) |> filter(!is.na(clean_assets)) |>
  dplyr::left_join(
    df_clean |> select(ResponseId, assets_clean),
    by = "ResponseId"
  ) |>
  dplyr::mutate(match = clean_assets == assets_clean) |> dplyr::group_by(match) |> dplyr::summarise(n = dplyr::n())
# A tibble: 1 × 2
  match     n
  <lgl> <int>
1 TRUE  27351
# All values between clean_debts in assessment_fin
# and debts_clean in df_clean match?
assessment_fin |> select(ResponseId, clean_debts) |> filter(!is.na(clean_debts)) |>
  dplyr::left_join(
    df_clean |> select(ResponseId, debts_clean),
    by = "ResponseId"
  ) |>
  dplyr::mutate(match = clean_debts == debts_clean) |> dplyr::group_by(match) |> dplyr::summarise(n = dplyr::n())
# A tibble: 1 × 2
  match     n
  <lgl> <int>
1 TRUE  12860
# Sanity check: Any duplicated ResponseId in assessment_fin?
assessment_fin |>
  dplyr::count(ResponseId) |>
  dplyr::filter(n > 1) |>
  base::nrow()
[1] 0
# Join assessments to main data frame
nrow(df_clean)
[1] 53825
df_clean <- df_clean |>
  dplyr::left_join(
    assessment_fin |> dplyr::select(
      ResponseId,
      classification_income,
      change_income,
      classification_assets,
      change_assets,
      classification_debts,
      change_debts
    ),
    by = c("ResponseId")
  ) |>

  # Apply the changes recommended by collaborators
  dplyr::mutate(
    income_text_reviewed = dplyr::case_when(
      !is.na(classification_income) &
      stringr::str_detect(classification_income, 
                          "Change to: \\[add value on column F\\]") 
      ~ as.numeric(change_income),
      TRUE ~ income_text_clean
    ),
    assets_reviewed = dplyr::case_when(
      !is.na(classification_assets) &
      stringr::str_detect(classification_assets, 
                          "Change to: \\[add value on column F\\]") 
      ~ as.numeric(change_assets),
      TRUE ~ assets_clean
    ),
    debts_reviewed = dplyr::case_when(
      !is.na(classification_debts) &
      stringr::str_detect(classification_debts, 
                          "Change to: \\[add value on column F\\]") 
      ~ as.numeric(change_debts),
      TRUE ~ debts_clean
    )
  )

nrow(df_clean)
[1] 53825
# Sanity checks: View the counts of cells that were automatically approved 
# and were still reviewed by collaborators
df_clean |> dplyr::select(ResponseId, country, income_text_clean, 
                          classification_income, fin_valid_aut_income,
                          fin_valid_aut_income_update) |> 
  dplyr::filter(fin_valid_aut_income_update == 1 & !is.na(classification_income) 
                & income_text_clean > 0) |> 
  dplyr::group_by(country, classification_income) |> 
  dplyr::summarise(n = dplyr::n()) |> 
  base::nrow()
[1] 0
df_clean |> dplyr::select(ResponseId, country, assets_clean, 
                          classification_assets, fin_valid_aut_assets,
                          fin_valid_aut_assets_update) |>
  dplyr::filter(fin_valid_aut_assets_update==1 & !is.na(classification_assets) 
                & assets_clean > 0) |> 
  dplyr::group_by(country, classification_assets) |> 
  dplyr::summarise(n = dplyr::n())
# A tibble: 3 × 3
# Groups:   country [2]
  country  classification_assets     n
  <chr>    <chr>                 <int>
1 USA      Cannot determine          2
2 USA      OK                       94
3 Zimbabwe OK                       82
df_clean |> dplyr::select(ResponseId, country, debts_clean, 
                          classification_debts, fin_valid_aut_debts,
                          fin_valid_aut_debts_update) |>
  dplyr::filter(fin_valid_aut_debts_update== 1 & !is.na(classification_debts) &
                  debts_clean > 0) |> 
  dplyr::group_by(country, classification_debts) |> 
  dplyr::summarise(n = dplyr::n())
# A tibble: 3 × 3
# Groups:   country [2]
  country  classification_debts                   n
  <chr>    <chr>                              <int>
1 USA      OK                                    14
2 Zimbabwe Change to: [add value on column F]    20
3 Zimbabwe OK                                    58
# Sanity checks: View the counts of cells that were automatically disapproved 
# and were not reviewed by collaborators
df_clean |> dplyr::select(ResponseId, country, income_text_clean, 
                          classification_income, fin_valid_aut_income,
                          fin_valid_aut_income_update) |>
  dplyr::filter(fin_valid_aut_income_update == 0 & is.na(classification_income) 
                & income_text_clean > 0) |> 
  dplyr::group_by(country, classification_income) |> 
  dplyr::summarise(n = dplyr::n()) |> 
  base::nrow()
[1] 0
df_clean |> dplyr::select(ResponseId, country, assets_clean, 
                          classification_assets, fin_valid_aut_assets,
                          fin_valid_aut_assets_update) |>
  dplyr::filter(fin_valid_aut_assets_update == 0 & is.na(classification_assets) 
                & assets_clean > 0) |> 
  dplyr::group_by(country, classification_assets) |> 
  dplyr::summarise(n = dplyr::n()) |> 
  base::nrow()
[1] 0
df_clean |> dplyr::select(ResponseId, country, debts_clean, classification_debts,
                          fin_valid_aut_debts, fin_valid_aut_debts_update) |>
  dplyr::filter(fin_valid_aut_debts_update == 0 & is.na(classification_debts) 
                & debts_clean > 0) |> 
  dplyr::group_by(country, classification_debts) |> 
  dplyr::summarise(n = dplyr::n()) |> 
  base::nrow()
[1] 0
# Create variable where we fit the open field answers into the brackets
find_decile <- function(lang, income_val) {
  # If value is missing, return NA for this row
  if (is.na(income_val)) {
    return(NA_real_)
  }

  # Subset brackets for language
  brackets <- income_gaps[income_gaps$UserLanguage == lang, ]

  # If no brackets available for this language, return NA
  if (nrow(brackets) == 0) {
    return(NA_real_)
  }

  for (j in seq_len(nrow(brackets))) {
    low  <- brackets$income_lowpoint_adj[j]
    high <- brackets$income_highpoint_adj[j]

    # Skip rows with missing low
    if (is.na(low)) {
      next
    }

    # Open-ended bracket: [low, ∞)
    if (is.na(high)) {
      if (income_val >= low) {
        return(base::as.numeric(brackets$income_orig[j]))
      } else {
        next
      }
    }

    # Interval [low, high] inclusive
    if (income_val >= low && income_val <= high) {
      return(base::as.numeric(brackets$income_orig[j]))
    }
  }

  # If higher than all defined brackets, assign 9 by your current rule
  9
}

df_clean <- df_clean |>
  dplyr::mutate(
    income_merg = dplyr::case_when(
      
      is.na(income_orig) ~ NA_real_, 
      
      !is.na(income_orig) & income_orig != 10 ~ income_orig,
      
      income_orig == 10 & is.na(income_text_reviewed) ~ NA_real_,
      
      income_orig == 10 &
      !is.na(income_text_reviewed) & 
      income_text_reviewed == 0 ~ 0,
      
      TRUE ~ purrr::map2_dbl(
        UserLanguage,
        income_text_reviewed,
        find_decile
      )
    ),
    income_merg_cat = base::factor(
      dplyr::case_when(
        income_merg == 0 ~ "No income",
        income_merg == 1 ~ "Second decile",
        income_merg == 2 ~ "Third decile",
        income_merg == 3 ~ "Fourth decile",
        income_merg == 4 ~ "Fifth decile",
        income_merg == 5 ~ "Sixth decile",
        income_merg == 6 ~ "Seventh decile",
        income_merg == 7 ~ "Eighth decile",
        income_merg == 8 ~ "Ninth decile",
        income_merg == 9 ~ "Tenth decile",
        TRUE ~ NA_character_
      ),
      levels = c(
        "No income",
        "Second decile",
        "Third decile",
        "Fourth decile",
        "Fifth decile",
        "Sixth decile",
        "Seventh decile",
        "Eighth decile",
        "Ninth decile",
        "Tenth decile"
      ),
      ordered = TRUE
    ),
    income_merg_group =  base::factor(
      dplyr::case_when(
        income_merg_cat == "No income"
        ~ "No income",
        income_merg_cat %in% c("Second decile", "Third decile", "Fourth decile")
        ~ "Low",
        income_merg_cat %in% c("Fifth decile", "Sixth decile")
        ~ "Mid",
        income_merg_cat %in% c("Seventh decile", "Eighth decile", "Ninth decile")
        ~ "Upper",
        income_merg_cat == "Tenth decile"
        ~ "Wealthiest",
        TRUE ~ NA_character_
      ),
      levels = c("No income", "Low", "Mid", "Upper", "Wealthiest"),
      ordered = TRUE
    )
  )

df_clean <- df_clean |>
  dplyr::left_join(
    income_gaps |>
      dplyr::select(
        UserLanguage,
        income_orig,
        income_lowpoint_adj,
        income_highpoint_adj
      ),
    by = c("UserLanguage", "income_merg" = "income_orig")
  ) |>
  dplyr::mutate(
    income_merg_translated = dplyr::case_when(
      is.na(income_merg) ~ NA_character_,

      income_merg == 0 ~ "0",

      # Closed interval [low, high]
      !is.na(income_lowpoint_adj) &
      !is.na(income_highpoint_adj)
      ~ paste0(
          income_lowpoint_adj,
          " - ",
          income_highpoint_adj
        ),

      # Open upper bound [low, ∞)
      !is.na(income_lowpoint_adj) &
      is.na(income_highpoint_adj)
      ~ paste0(
          income_lowpoint_adj,
          "+"
        ),

      TRUE ~ NA_character_
    )
  )

# Sanity checks: View counts of merged income variable
df_clean |>
  dplyr::filter(
    income_orig == 10,
    !is.na(income_text_reviewed)
  ) |>
  dplyr::group_by(
    UserLanguage,
    income_merg,
    income_merg_translated
  ) |>
  dplyr::summarise(
    min_income_text_reviewed = min(income_text_reviewed, na.rm = TRUE),
    max_income_text_reviewed = max(income_text_reviewed, na.rm = TRUE)
  ) |>
  print_reactable(sorted_col = "UserLanguage", width = 800)
df_clean |> 
  dplyr::group_by(income_orig, income_merg) |> 
  dplyr::summarise(n = dplyr::n()) |> 
  base::print(n = Inf)
# A tibble: 21 × 3
# Groups:   income_orig [12]
   income_orig income_merg     n
         <dbl>       <dbl> <int>
 1           0           0  1866
 2           1           1  4775
 3           2           2  5754
 4           3           3  5961
 5           4           4  5778
 6           5           5  5195
 7           6           6  4646
 8           7           7  4583
 9           8           8  3585
10           9           9  5347
11          10           0    91
12          10           1  1951
13          10           2   834
14          10           3   717
15          10           4   583
16          10           5   443
17          10           6   356
18          10           7   297
19          10           8   208
20          10           9   746
21          NA          NA   109
# Cleanup
rm(fin_cut_income, fin_cut_debts, fin_cut_assets, find_decile,
   assessment_fin, process_sheet, files, df_cut_assets, df_cut_debts, 
   df_cut_income)

Red flag exclusion

Each flag corresponds to a specific pattern that may indicate low-quality data. This process was only applied to participants who passed the direct exclusion criteria.

# Identified participants with IP addresses known to be associated with botnets.
botnet_ids <-
  readr::read_csv("111_response_ids_botnets.csv", show_col_types = FALSE) |>
  dplyr::pull(ResponseId) |>
  base::trimws(); length(botnet_ids)
[1] 262
# Identified participants with IP addresses massively repeated
# across multiple responses.
massive_rep_ids <-
  readr::read_csv("111_ip_repeated.csv", show_col_types = FALSE) |>
  dplyr::pull(ResponseId) |>
  base::trimws(); length(massive_rep_ids)
[1] 10300
# Start the flagging process.
df_flagged <- df_clean |>
  dplyr::mutate(

    # F1. Household >=4 and zero income
    flag_hh4_zero_income =
      dplyr::if_else(
          irl == 0 &
          household_size >= 4 &
          income_merg == 0,
        1,
        0,
        missing = NA_real_
      ),

    # F2. Any financial items not valid
    flag_fin_invalid =
      dplyr::if_else(
          irl == 0 &
        (
          (!is.na(classification_assets) &
             !(classification_assets %in% c("OK", "Change to: [add value on column F]"))) |
          (!is.na(classification_debts) &
             !(classification_debts %in% c("OK", "Change to: [add value on column F]"))) |
          (!is.na(classification_income) &
             !(classification_income %in% c("OK", "Change to: [add value on column F]")))
        ),
        1,
        0,
        missing = NA_real_
      ),

    # F3. Low variance in MPWB, life satisfaction = 10,
    # and no income or very low education
    flag_ls10_noincome_var =
      dplyr::if_else(
          irl == 0 &
          mpwb_var < 1 &
          life_satisfaction == 10 &
          (education_recoded == 1 |
           income_merg == 0),
        1,
        0,
        missing = NA_real_
      ),

    # F4. Assets and debts are the same value (excluding both zero and NA)
    flag_assets_debts_same =
      dplyr::if_else(
          irl == 0 &
          !is.na(assets_clean) &
          !is.na(debts_clean) &
          assets_clean == debts_clean &
          !(assets_clean == 0 & debts_clean == 0),
        1,
        0,
        missing = NA_real_
      ),

    # F5. Full-time student and lowest education level
    # (Peru participants that selected inclusive education are exempt because
    # they have NA in education_recoded)
    flag_student_lowedu =
      dplyr::if_else(
          !is.na(education_recoded) &
          education_recoded == 1 &
          !is.na(employment_orig) &
          stringr::str_detect(employment_orig, "\\b1\\b"),
        1,
        0,
        missing = NA_real_
      ),

    # F6. Zero variance in MPWB items
    flag_mpwb_zerovar =
      dplyr::if_else(
        !is.na(mpwb_var) & mpwb_var == 0,
        0.5,
        0,
        missing = NA_real_
      ),

    # F7. Nonsensical sex or ethnicity
    flag_nonsensical_sex_ethn =
      dplyr::if_else(
          irl == 0 &
          (
            (!is.na(sex_text_recoded) & sex_text_recoded == "Cannot determine") |
            (!is.na(ethnicity_specify) & ethnicity_specify == "Cannot determine")
          ),
        1,
        0,
        missing = NA_real_
      ),

    # F8. High MPWB and high PHQ-4
    flag_high_mpwb_phq4 =
      dplyr::if_else(
          irl == 0 &
          !is.na(gad_worry) &
          mpwb_sum >= 60 &
          phq4_sum >= 20 &
          duration_adj < 10,
        1,
        0,
        missing = NA_real_
      ),

    # F9. LS vs mean MPWB mismatch
    flag_ls_vs_mpwb =
      dplyr::case_when(
        base::abs(life_satisfaction - mpwb_mean) > 5 ~ 2,
        base::abs(life_satisfaction - mpwb_mean) > 4 ~ 1,
        TRUE ~ 0
      ),

    # F10. Age >75 and working or studying
    flag_age75_workstudy =
      dplyr::if_else(
          age >= 75 &
          !is.na(employment_orig) &
          stringr::str_detect(employment_orig, "\\b(1|2|3|4|5)\\b"),
        1,
        0,
        missing = NA_real_
      ),

    # F11. Advanced education and <22 years
    flag_young_advance =
      dplyr::if_else(
          !is.na(education_recoded) &
          education_recoded == 5 &
          age < 22,
        1,
        0,
        missing = NA_real_
      ),

    # F12. Short duration and low MPWB variance
    flag_duration_var =
      dplyr::if_else(
          irl == 0 &
          duration_adj < 10 &
          mpwb_var < 1,
        1,
        0,
        missing = NA_real_
      ),

    # F13. Independent, <20, and richest income
    flag_young_rich_alone =
      dplyr::if_else(
          household_size == 1 &
          age < 20 &
          income_merg >= 8,
        1,
        0,
        missing = NA_real_
      ),

    # F14. Retired and working at the same time
    flag_retired_working =
      dplyr::if_else(
          irl == 0 &
          !is.na(employment_orig) &
          stringr::str_detect(employment_orig, "\\b6\\b") &
          stringr::str_detect(employment_orig, "\\b(3|4|5)\\b"),
        1,
        0,
        missing = NA_real_
      ),

    # F15. Strange numbers in financial variables
    # If collaborators already marked the value as not OK,
    # then there is no need to repeat this flag.
    flag_weird_nr =
      dplyr::case_when(
          irl == 0 &
          (
            (assets_wrd & classification_assets == "OK") |
            (debts_wrd & classification_debts == "OK") |
            (income_wrd & classification_income == "OK")
          )
          ~ 1,
        TRUE ~ 0
      ),

    # F16. Botnet ResponseIds
    flag_botnet =
      dplyr::if_else(
          ResponseId %in% botnet_ids,
        1,
        0,
        missing = NA_real_
      ),

    # F17. Massive repetition of IP + short duration + low variance
    flag_rep =
      dplyr::if_else(
          ResponseId %in% massive_rep_ids &
          duration_adj < 10 &
          mpwb_var < 1,
        1,
        0,
        missing = NA_real_
      ),

    # Total flags
    flag_total =
      base::rowSums(
        dplyr::across(dplyr::starts_with("flag_")),
        na.rm = TRUE
      ),

    # Exclusion flag
    exclusion_flags =
      dplyr::if_else(
        flag_total > 4,
        1,
        0
      ),

    # Update valid_status
    valid_status = base::as.character(valid_status),
    valid_status =
      base::factor(
        dplyr::case_when(
          exclusion_flags == 1 ~ "flagged",
          TRUE ~ valid_status
        ),
        levels = c(
          "flagged",
          "passed"
        )
      )
  )

# Sanity Check: View the counts of exclusion flags
table(df_flagged$valid_status, df_flagged$exclusion_flags, useNA = "always")
         
              0     1  <NA>
  flagged     0    26     0
  passed  53799     0     0
  <NA>        0     0     0
# Sanity Check: View the counts and percentages of each flag
df_flagged |>
  dplyr::select(dplyr::starts_with("flag_")) |>
  dplyr::summarise(
    dplyr::across(
      dplyr::everything(),
      ~ sum((!is.na(.) & . != 0))
    )
  ) |>
  tidyr::pivot_longer(
    cols = dplyr::everything(),
    names_to = "flag",
    values_to = "n_flagged"
  ) |>
  dplyr::mutate(
    percent_flagged = 100 * n_flagged / nrow(df_flagged)
  ) |>
  dplyr::arrange(dplyr::desc(percent_flagged))
# A tibble: 18 × 3
   flag                      n_flagged percent_flagged
   <chr>                         <int>           <dbl>
 1 flag_total                    16396         30.5   
 2 flag_duration_var              6935         12.9   
 3 flag_fin_invalid               4366          8.11  
 4 flag_ls_vs_mpwb                3146          5.84  
 5 flag_mpwb_zerovar              1804          3.35  
 6 flag_nonsensical_sex_ethn      1052          1.95  
 7 flag_assets_debts_same          703          1.31  
 8 flag_rep                        621          1.15  
 9 flag_hh4_zero_income            493          0.916 
10 flag_retired_working            337          0.626 
11 flag_ls10_noincome_var          215          0.399 
12 flag_botnet                     202          0.375 
13 flag_student_lowedu             162          0.301 
14 flag_weird_nr                   124          0.230 
15 flag_age75_workstudy             70          0.130 
16 flag_young_advance               50          0.0929
17 flag_high_mpwb_phq4              29          0.0539
18 flag_young_rich_alone             8          0.0149
# Sanity check: View combinations of classifications where flag_hh4_zero_income is raised
df_flagged |> 
  dplyr::filter(flag_hh4_zero_income == 1) |> 
  dplyr::group_by(household_size, income_merg, irl) |> 
  dplyr::summarise(n = dplyr::n())
# A tibble: 15 × 4
# Groups:   household_size, income_merg [15]
   household_size income_merg   irl     n
            <dbl>       <dbl> <dbl> <int>
 1              4           0     0   196
 2              5           0     0   120
 3              6           0     0    68
 4              7           0     0    41
 5              8           0     0    20
 6              9           0     0    10
 7             10           0     0    15
 8             11           0     0     1
 9             12           0     0     4
10             13           0     0     3
11             14           0     0     3
12             15           0     0     2
13             17           0     0     1
14             18           0     0     1
15             20           0     0     8
# Sanity check: View combinations of classifications where flag_fin_invalid is raised
df_flagged |> 
  dplyr::filter(flag_fin_invalid == 1) |> 
  dplyr::group_by(classification_income, classification_assets, classification_debts, irl) |> 
  dplyr::summarise(n = dplyr::n())  |>
  print_reactable(sorted_col = "classification_income", width = 800)
# Sanity check: View combinations of classifications where flag_ls10_noincome_var is raised
df_flagged |> 
  dplyr::filter(flag_ls10_noincome_var == 1) |> 
  dplyr::group_by(mpwb_var, life_satisfaction, education_recoded, income_merg, irl) |> 
  dplyr::summarise(n = dplyr::n()) |>
  print_reactable(sorted_col = "mpwb_var", width = 800)
# Sanity check: View combinations of classifications where flag_assets_debts_same is raised
df_flagged |> 
  dplyr::filter(flag_assets_debts_same == 1) |> 
  dplyr::group_by(assets_clean, debts_clean, irl) |> 
  dplyr::summarise(n = dplyr::n()) |>
  print_reactable(sorted_col = "assets_clean", width = 500)
# Sanity check: View combinations of classifications where flag_student_lowedu is raised
df_flagged |> 
  dplyr::filter(flag_student_lowedu == 1) |> 
  dplyr::group_by(education_recoded, employment_orig) |> 
  dplyr::summarise(n = dplyr::n())
# A tibble: 8 × 3
# Groups:   education_recoded [1]
  education_recoded employment_orig     n
              <dbl> <chr>           <int>
1                 1 1                 130
2                 1 1,3                 7
3                 1 1,4                 7
4                 1 1,5                 1
5                 1 1,6                 1
6                 1 1,7                 3
7                 1 1,8                 6
8                 1 1,9                 7
# Sanity check: View combinations of classifications where flag_mpwb_zerovar is raised
df_flagged |> 
  dplyr::filter(flag_mpwb_zerovar == 0.5) |> 
  dplyr::group_by(mpwb_var) |> 
  dplyr::summarise(n = dplyr::n())
# A tibble: 1 × 2
  mpwb_var     n
     <dbl> <int>
1        0  1804
# Sanity check: View combinations of classifications where flag_nonsensical_sex_ethn is raised
df_flagged |> 
  dplyr::filter(flag_nonsensical_sex_ethn == 1) |> 
  dplyr::group_by(sex_text_recoded, ethnicity_specify, irl) |> 
  dplyr::summarise(n = dplyr::n())
# A tibble: 10 × 4
# Groups:   sex_text_recoded, ethnicity_specify [10]
   sex_text_recoded ethnicity_specify                              irl     n
   <chr>            <chr>                                        <dbl> <int>
 1 Cannot determine Cannot determine                                 0    14
 2 Cannot determine Cypriot                                          0     1
 3 Cannot determine Other                                            0     5
 4 Cannot determine Roma                                             0     1
 5 Cannot determine The 4 Rs: (Nkole, Kiga, Batooro and Banyoro)     0     1
 6 Cannot determine White                                            0     1
 7 Cannot determine <NA>                                             0    81
 8 Male             Cannot determine                                 0     2
 9 Non-binary       Cannot determine                                 0     6
10 <NA>             Cannot determine                                 0   940
# Sanity check: View combinations of classifications where flag_high_mpwb_phq4 is raised
df_flagged |> 
  dplyr::filter(flag_high_mpwb_phq4 == 1) |> 
  dplyr::group_by(mpwb_sum, phq4_sum, gad_worry, duration_adj, irl) |> 
  dplyr::summarise(n = dplyr::n()) |>
  base::print(n = Inf)
# A tibble: 29 × 6
# Groups:   mpwb_sum, phq4_sum, gad_worry, duration_adj [29]
   mpwb_sum phq4_sum gad_worry duration_adj   irl     n
      <dbl>    <dbl>     <dbl>        <dbl> <dbl> <int>
 1       60       20         5         7.66     0     1
 2       60       22         5         6.41     0     1
 3       60       25         6         9.03     0     1
 4       60       28         7         7.66     0     1
 5       61       22         4         9.59     0     1
 6       61       28         7         7.10     0     1
 7       62       20         4         7.8      0     1
 8       62       22         7         8.83     0     1
 9       62       25         5         9.66     0     1
10       62       25         7         6.72     0     1
11       62       26         6         7.79     0     1
12       62       28         7         9.39     0     1
13       62       28         7         9.93     0     1
14       63       20         6         9.17     0     1
15       63       21         4         5.55     0     1
16       63       24         5         6.76     0     1
17       63       24         6         6.59     0     1
18       63       24         6         8.2      0     1
19       63       24         6         9.10     0     1
20       63       25         7         5.90     0     1
21       63       28         7         7.03     0     1
22       64       22         7         8.23     0     1
23       65       20         5         9        0     1
24       65       21         6         5.37     0     1
25       66       21         6         7.83     0     1
26       66       22         6         9.38     0     1
27       70       20         5         6.90     0     1
28       70       20         7         7.93     0     1
29       70       22         4         6.6      0     1
# Sanity check: View combinations of classifications where flag_ls_vs_mpwb is raised
df_flagged |> 
  dplyr::filter(flag_ls_vs_mpwb >= 1) |>
  dplyr::mutate(diff_ls_mpwb = base::abs(life_satisfaction - mpwb_mean)) |>
  dplyr::group_by(life_satisfaction, mpwb_mean, diff_ls_mpwb, flag_ls_vs_mpwb, irl) |> 
  dplyr::summarise(n = dplyr::n()) |>
  print_reactable(sorted_col = "mpwb_mean", width = 500)
# Sanity check: View combinations of classifications where flag_age75_workstudy is raised
df_flagged |> 
  dplyr::filter(flag_age75_workstudy == 1) |>
  dplyr::group_by(employment_cat, age) |> 
  dplyr::summarise(n = dplyr::n()) |>
  print_reactable(sorted_col = "employment_cat", width = 800)
# Sanity check: View combinations of classifications where flag_young_advance is raised
df_flagged |> 
  dplyr::filter(flag_young_advance == 1) |>
  dplyr::group_by(education_recoded_cat, age) |> 
  dplyr::summarise(n = dplyr::n())
# A tibble: 4 × 3
# Groups:   education_recoded_cat [1]
  education_recoded_cat   age     n
  <ord>                 <dbl> <int>
1 Advanced                 18     5
2 Advanced                 19    13
3 Advanced                 20    12
4 Advanced                 21    20
# Sanity check: View combinations of classifications where flag_duration_var is raised
df_flagged |> 
  dplyr::filter(flag_duration_var == 1) |>
  dplyr::mutate(min_duration_adj = min(duration_adj), 
                max_duration_adj = max(duration_adj),
                min_mpwb_var = min(mpwb_var),
                max_mpwb_var = max(mpwb_var)) |>
  dplyr::group_by(min_duration_adj, max_duration_adj, 
                  min_mpwb_var, max_mpwb_var, irl) |> 
  dplyr::summarise(n = dplyr::n())
# A tibble: 1 × 6
# Groups:   min_duration_adj, max_duration_adj, min_mpwb_var, max_mpwb_var [1]
  min_duration_adj max_duration_adj min_mpwb_var max_mpwb_var   irl     n
             <dbl>            <dbl>        <dbl>        <dbl> <dbl> <int>
1                5             9.97            0        0.989     0  6935
# Sanity check: View combinations of classifications where flag_young_rich_alone is raised
df_flagged |> 
  dplyr::filter(flag_young_rich_alone == 1) |>
  dplyr::group_by(household_size, age, income_merg) |> 
  summarise(n = dplyr::n())
# A tibble: 4 × 4
# Groups:   household_size, age [2]
  household_size   age income_merg     n
           <dbl> <dbl>       <dbl> <int>
1              1    18           8     2
2              1    18           9     1
3              1    19           8     3
4              1    19           9     2
# Sanity check: View combinations of classifications where flag_retired_working is raised
df_flagged |> 
  dplyr::filter(flag_retired_working == 1) |>
  dplyr::group_by(employment_orig, age, irl) |> 
  summarise(n = dplyr::n()) |>
  print_reactable(sorted_col = "employment_orig", width = 500)
# Sanity check: View combinations of classifications where flag_weird_nr is raised
df_flagged |> 
  dplyr::filter(flag_weird_nr == 1) |>
  dplyr::group_by(assets_wrd, classification_assets, 
                  debts_wrd, classification_debts, 
                  income_wrd, classification_income, irl) |> 
  dplyr::summarise(n = dplyr::n()) |> 
  base::print(n = Inf)
# A tibble: 25 × 8
# Groups:   assets_wrd, classification_assets, debts_wrd, classification_debts, income_wrd, classification_income [25]
   assets_wrd classification_assets       debts_wrd classification_debts        income_wrd classification_income                irl     n
   <lgl>      <chr>                       <lgl>     <chr>                       <lgl>      <chr>                              <dbl> <int>
 1 FALSE      OK                          FALSE     <NA>                        TRUE       OK                                     0    16
 2 FALSE      OK                          TRUE      OK                          FALSE      OK                                     0     2
 3 FALSE      OK                          TRUE      OK                          FALSE      Very unlikely to be true               0     2
 4 FALSE      OK                          TRUE      OK                          FALSE      <NA>                                   0    19
 5 FALSE      OK                          TRUE      <NA>                        TRUE       OK                                     0     1
 6 FALSE      Very unlikely to be true    FALSE     <NA>                        TRUE       OK                                     0     1
 7 FALSE      Very unlikely to be true    TRUE      OK                          FALSE      <NA>                                   0     2
 8 FALSE      <NA>                        FALSE     OK                          TRUE       OK                                     0     3
 9 FALSE      <NA>                        FALSE     <NA>                        TRUE       OK                                     0     6
10 FALSE      <NA>                        TRUE      OK                          FALSE      OK                                     0     1
11 FALSE      <NA>                        TRUE      OK                          FALSE      <NA>                                   0    13
12 TRUE       Not possible/not believable TRUE      Not possible/not believable TRUE       OK                                     0     1
13 TRUE       OK                          FALSE     Not possible/not believable FALSE      <NA>                                   0     3
14 TRUE       OK                          FALSE     OK                          FALSE      Change to: [add value on column F]     0     1
15 TRUE       OK                          FALSE     OK                          FALSE      OK                                     0     1
16 TRUE       OK                          FALSE     OK                          FALSE      <NA>                                   0     6
17 TRUE       OK                          FALSE     Very unlikely to be true    FALSE      <NA>                                   0     7
18 TRUE       OK                          FALSE     <NA>                        FALSE      Change to: [add value on column F]     0     1
19 TRUE       OK                          FALSE     <NA>                        FALSE      OK                                     0     1
20 TRUE       OK                          FALSE     <NA>                        FALSE      Very unlikely to be true               0     1
21 TRUE       OK                          FALSE     <NA>                        FALSE      <NA>                                   0    31
22 TRUE       OK                          TRUE      Cannot determine            FALSE      <NA>                                   0     1
23 TRUE       OK                          TRUE      OK                          FALSE      <NA>                                   0     1
24 TRUE       OK                          TRUE      Very unlikely to be true    FALSE      Very unlikely to be true               0     1
25 TRUE       Very unlikely to be true    TRUE      OK                          FALSE      <NA>                                   0     2
# Apply reviewed financial variables, unless all three flags F4, F15, and F2 are raised
df_flagged <- df_flagged |>
  dplyr::mutate(
    assets_reviewed = dplyr::case_when(
      flag_assets_debts_same == 1 &
      flag_weird_nr == 1 &
      flag_fin_invalid == 1 ~ NA_real_,

      !is.na(classification_assets) &
        !classification_assets %in% c("OK", "Change to: [add value on column F]")
      ~ NA_real_,

      TRUE ~ assets_reviewed
    ),
    debts_reviewed = dplyr::case_when(
      flag_assets_debts_same == 1 &
      flag_weird_nr == 1 &
      flag_fin_invalid == 1 ~ NA_real_,

      !is.na(classification_debts) &
        !classification_debts %in% c("OK", "Change to: [add value on column F]")
      ~ NA_real_,

      TRUE ~ debts_reviewed
    ),
    income_text_reviewed = dplyr::case_when(
      flag_assets_debts_same == 1 &
      flag_weird_nr == 1 &
      flag_fin_invalid == 1 ~ NA_real_,

      !is.na(classification_income) &
        !classification_income %in% c("OK", "Change to: [add value on column F]")
      ~ NA_real_,

      TRUE ~ income_text_reviewed
    ),
    income_merg = dplyr::case_when(
      flag_assets_debts_same == 1 &
      flag_weird_nr == 1 &
      flag_fin_invalid == 1 ~ NA_real_,

      !is.na(classification_income) &
        !classification_income %in% c("OK", "Change to: [add value on column F]")
      ~ NA_real_,

      TRUE ~ income_merg
    ),
    income_merg_translated = dplyr::case_when(
      flag_assets_debts_same == 1 &
      flag_weird_nr == 1 &
      flag_fin_invalid == 1 ~ NA_character_,

      !is.na(classification_income) &
        !classification_income %in% c("OK", "Change to: [add value on column F]")
      ~ NA_character_,

      TRUE ~ income_merg_translated
    ),
    income_merg_group = dplyr::case_when(
      flag_assets_debts_same == 1 &
      flag_weird_nr == 1 &
      flag_fin_invalid == 1 ~ NA_character_,

      !is.na(classification_income) &
        !classification_income %in% c("OK", "Change to: [add value on column F]")
      ~ NA_character_,

      TRUE ~ income_merg_group
    ),
    income_merg_cat = dplyr::case_when(
      flag_assets_debts_same == 1 &
      flag_weird_nr == 1 &
      flag_fin_invalid == 1 ~ NA_character_,

      !is.na(classification_income) &
        !classification_income %in% c("OK", "Change to: [add value on column F]")
      ~ NA_character_,

      TRUE ~ income_merg_cat
    ),
    
    income_merg_cat = base::factor(
      income_merg_cat,
      levels = c(
        "No income",
        "Second decile",
        "Third decile",
        "Fourth decile",
        "Fifth decile",
        "Sixth decile",
        "Seventh decile",
        "Eighth decile",
        "Ninth decile",
        "Tenth decile"
      ),
      ordered = TRUE
    ),
    
    income_merg_group = base::factor(
      income_merg_group,
      levels = c(
        "No income",
        "Low",
        "Mid",
        "Upper",
        "Wealthiest"
      ),
      ordered = TRUE
    )
  )

# Sanity check:
df_flagged |> 
  dplyr::filter(is.na(income_text_reviewed) & !is.na(income_merg) & income_orig == 10) |>
  base::nrow()
[1] 0
df_flagged |> 
  dplyr::filter(
      is.na(income_text_reviewed) &
      income_orig ==10 & 
      classification_income %in% c("OK", "Change to: [remove value]")) |> 
  base::nrow()
[1] 0
df_flagged |> 
  dplyr::filter(
      is.na(income_text_reviewed) &
      income_orig == 10 & 
      is.na(classification_income)) |> 
  base::nrow()
[1] 0
# Create final data frame with only "passed" participants after flagging
df_final <- df_flagged |> 
  dplyr::filter(valid_status == "passed") |>
  dplyr::select(-valid_status, -dplyr::starts_with("flag_"))

# Cleanup
rm(botnet_ids, massive_rep_ids, income_gaps, income_info)

Exclusion Summary

# Combine direct exclusions and flags
df_exclusion <- df_merged |>
  dplyr::left_join(
    df_flagged |>
      dplyr::select(ResponseId, exclusion_flags),
    by = "ResponseId"
  ) |>
  dplyr::mutate(
    exclusion_criteria = base::factor(
      dplyr::case_when(
      valid_status %in% c("incomplete","not residents") ~ valid_status,
      valid_status %in% c(
        "implausible working with no income",
        "implausible retired young",
        "implausible extremes",
        "implausible high scores with speed"
      ) ~ "implausible",
      valid_status %in% c("too fast general","too fast low var") ~ "too fast",
      valid_status == "passed" & 
        !is.na(exclusion_flags) & exclusion_flags == 1 ~ "flagged",
      
      valid_status == "passed" & 
        (is.na(exclusion_flags) | exclusion_flags == 0) ~ "valid",
      
      TRUE ~ NA_character_
    ),
      levels = c(
        "valid",
        "incomplete",
        "too fast",
        "not residents",
        "implausible",
        "flagged"
      )
    )
  )

# Country-level summary
summary_table <- df_exclusion |>
  dplyr::group_by(country) |>
  dplyr::summarise(
    initial_number_of_participants = dplyr::n(),
    valid_participants = base::sum(exclusion_criteria == "valid", na.rm = TRUE),
    incomplete = base::sum(exclusion_criteria == "incomplete", na.rm = TRUE),
    too_fast = base::sum(exclusion_criteria == "too fast", na.rm = TRUE),
    not_residents = base::sum(exclusion_criteria == "not residents", na.rm = TRUE),
    implausible = base::sum(exclusion_criteria == "implausible", na.rm = TRUE),
    flagged = base::sum(exclusion_criteria == "flagged", na.rm = TRUE)
  ) |>
  dplyr::mutate(
    total_exclusions =
      initial_number_of_participants - valid_participants,
    total_pct_lost =
      (total_exclusions / initial_number_of_participants) * 100
  )

# Total row
total_row <- summary_table |>
  dplyr::summarise(
    country = "Total",
    initial_number_of_participants =
      base::sum(initial_number_of_participants),
    valid_participants = base::sum(valid_participants),
    incomplete = base::sum(incomplete),
    too_fast = base::sum(too_fast),
    not_residents = base::sum(not_residents),
    implausible = base::sum(implausible),
    flagged = base::sum(flagged),
    total_exclusions = base::sum(total_exclusions),
    total_pct_lost =
      (total_exclusions / initial_number_of_participants) * 100
  )

summary_table_pct <- dplyr::bind_rows(summary_table, total_row) |>
  dplyr::mutate(
    incomplete = paste0(
      incomplete,
      " (",
      format(round((incomplete / initial_number_of_participants) * 100, 2), nsmall = 2),
      "%)"
    ),
    too_fast = paste0(
      too_fast,
      " (",
      format(round((too_fast / initial_number_of_participants) * 100, 2), nsmall = 2),
      "%)"
    ),
    not_residents = paste0(
      not_residents,
      " (",
      format(round((not_residents / initial_number_of_participants) * 100, 2), nsmall = 2),
      "%)"
    ),
    implausible = paste0(
      implausible,
      " (",
      format(round((implausible / initial_number_of_participants) * 100, 2), nsmall = 2),
      "%)"
    ),
    flagged = paste0(
      flagged,
      " (",
      format(round((flagged / initial_number_of_participants) * 100, 2), nsmall = 2),
      "%)"
    ),
    total_exclusions = paste0(
      total_exclusions,
      " (",
      format(round(total_pct_lost, 2), nsmall = 1),
      "%)"
    )
  ) |>
  dplyr::select(-total_pct_lost)

# gt
summary_table_pdf <- summary_table_pct |>
  dplyr::rename(
    Country = country,
    "Initial number of participants" = initial_number_of_participants,
    "Valid participants" = valid_participants,
    "Incomplete" = incomplete,
    "Too fast" = too_fast,
    "Not residents" = not_residents,
    "Implausible combinations" = implausible,
    "Flagged" = flagged,
    "Total exclusions" = total_exclusions
  )

gt_table <- summary_table_pdf |>
  gt::gt() |>
  gt::cols_width(
    Country ~ gt::px(65),
    `Initial number of participants` ~ gt::px(70),
    `Valid participants` ~ gt::px(70),
    Incomplete ~ gt::px(70),
    `Too fast` ~ gt::px(70),
    `Not residents` ~ gt::px(70),
    `Implausible combinations` ~ gt::px(70),
    `Flagged` ~ gt::px(70),
    `Total exclusions` ~ gt::px(105)
  ) |>
  gt::tab_options(
    table.font.size = 10,
    column_labels.font.size = 11,
    table.background.color = "white",
    table.align = "center",
    table.width = gt::px(650),
    table.border.top.color = "white",
    table.border.bottom.color = "white",
    table.border.left.color = "white",
    table.border.right.color = "white",
    table_body.hlines.color = "black",
    table_body.vlines.color = "white",
    column_labels.vlines.color = "white",
    column_labels.border.top.color = "white",
    column_labels.border.bottom.color = "black"
  ) |>
  gt::opt_table_lines() |>
  gt::tab_style(
    style = list(
      gt::cell_text(
        weight = "bold",
        align = "center"
      )
    ),
    locations = gt::cells_column_labels(gt::everything())
  ) |>
  gt::tab_style(
    style = gt::cell_text(
      align = "left"
    ),
    locations = gt::cells_body(columns = Country)
  ) |>
  gt::tab_style(
    style = gt::cell_text(
      align = "center"
    ),
    locations = gt::cells_body(
      columns = c(
        `Initial number of participants`,
        `Valid participants`,
        Incomplete,
        `Too fast`,
        `Not residents`,
        `Implausible combinations`,
        `Flagged`,
        `Total exclusions`
      )
    )
  ); gt_table
Country Initial number of participants Valid participants Incomplete Too fast Not residents Implausible combinations Flagged Total exclusions
Albania 2284 1758 487 (21.32%) 4 ( 0.18%) 34 (1.49%) 1 (0.04%) 0 (0.00%) 526 (23.03%)
Algeria 203 149 53 (26.11%) 1 ( 0.49%) 0 (0.00%) 0 (0.00%) 0 (0.00%) 54 (26.60%)
Angola 329 240 68 (20.67%) 15 ( 4.56%) 1 (0.30%) 5 (1.52%) 0 (0.00%) 89 (27.05%)
Argentina 769 634 119 (15.47%) 3 ( 0.39%) 11 (1.43%) 2 (0.26%) 0 (0.00%) 135 (17.56%)
Armenia 334 246 83 (24.85%) 1 ( 0.30%) 1 (0.30%) 3 (0.90%) 0 (0.00%) 88 (26.35%)
Australia 605 500 67 (11.07%) 25 ( 4.13%) 7 (1.16%) 5 (0.83%) 1 (0.17%) 105 (17.36%)
Austria 685 570 106 (15.47%) 8 ( 1.17%) 1 (0.15%) 0 (0.00%) 0 (0.00%) 115 (16.79%)
Bahrain 211 161 48 (22.75%) 1 ( 0.47%) 1 (0.47%) 0 (0.00%) 0 (0.00%) 50 (23.70%)
Bangladesh 536 335 186 (34.70%) 0 ( 0.00%) 0 (0.00%) 14 (2.61%) 1 (0.19%) 201 (37.50%)
Belgium 331 272 42 (12.69%) 14 ( 4.23%) 2 (0.60%) 1 (0.30%) 0 (0.00%) 59 (17.82%)
Bolivia 341 279 59 (17.30%) 0 ( 0.00%) 2 (0.59%) 1 (0.29%) 0 (0.00%) 62 (18.18%)
Bosnia-Herzegovina 642 486 144 (22.43%) 11 ( 1.71%) 1 (0.16%) 0 (0.00%) 0 (0.00%) 156 (24.30%)
Brazil 2094 1809 241 (11.51%) 30 ( 1.43%) 1 (0.05%) 13 (0.62%) 0 (0.00%) 285 (13.61%)
Bulgaria 393 324 67 (17.05%) 1 ( 0.25%) 0 (0.00%) 1 (0.25%) 0 (0.00%) 69 (17.56%)
Canada 874 707 148 (16.93%) 16 ( 1.83%) 1 (0.11%) 2 (0.23%) 0 (0.00%) 167 (19.11%)
Chad 192 115 73 (38.02%) 0 ( 0.00%) 2 (1.04%) 2 (1.04%) 0 (0.00%) 77 (40.10%)
Chile 240 207 30 (12.50%) 3 ( 1.25%) 0 (0.00%) 0 (0.00%) 0 (0.00%) 33 (13.75%)
China 2523 1018 215 ( 8.52%) 1277 (50.61%) 0 (0.00%) 13 (0.52%) 0 (0.00%) 1505 (59.65%)
Croatia 455 349 99 (21.76%) 7 ( 1.54%) 0 (0.00%) 0 (0.00%) 0 (0.00%) 106 (23.30%)
Cyprus 218 173 41 (18.81%) 0 ( 0.00%) 0 (0.00%) 4 (1.83%) 0 (0.00%) 45 (20.64%)
Czech Republic 267 202 63 (23.60%) 1 ( 0.37%) 1 (0.37%) 0 (0.00%) 0 (0.00%) 65 (24.34%)
Denmark 338 283 47 (13.91%) 7 ( 2.07%) 0 (0.00%) 1 (0.30%) 0 (0.00%) 55 (16.27%)
Ecuador 1075 954 94 ( 8.74%) 7 ( 0.65%) 11 (1.02%) 9 (0.84%) 0 (0.00%) 121 (11.26%)
Egypt 869 630 221 (25.43%) 5 ( 0.58%) 9 (1.04%) 4 (0.46%) 0 (0.00%) 239 (27.50%)
Estonia 2402 1903 480 (19.98%) 4 ( 0.17%) 3 (0.12%) 12 (0.50%) 0 (0.00%) 499 (20.77%)
Ethiopia 552 403 141 (25.54%) 1 ( 0.18%) 4 (0.72%) 3 (0.54%) 0 (0.00%) 149 (26.99%)
Finland 275 241 24 ( 8.73%) 7 ( 2.55%) 1 (0.36%) 2 (0.73%) 0 (0.00%) 34 (12.36%)
France 1175 908 153 (13.02%) 90 ( 7.66%) 2 (0.17%) 21 (1.79%) 1 (0.09%) 267 (22.72%)
Georgia 504 371 126 (25.00%) 6 ( 1.19%) 1 (0.20%) 0 (0.00%) 0 (0.00%) 133 (26.39%)
Germany 1008 824 156 (15.48%) 21 ( 2.08%) 1 (0.10%) 5 (0.50%) 1 (0.10%) 184 (18.25%)
Greece 532 444 81 (15.23%) 3 ( 0.56%) 2 (0.38%) 2 (0.38%) 0 (0.00%) 88 (16.54%)
Hong Kong 237 176 45 (18.99%) 6 ( 2.53%) 10 (4.22%) 0 (0.00%) 0 (0.00%) 61 (25.74%)
Hungary 735 555 169 (22.99%) 5 ( 0.68%) 3 (0.41%) 3 (0.41%) 0 (0.00%) 180 (24.49%)
India 1627 1225 315 (19.36%) 26 ( 1.60%) 21 (1.29%) 34 (2.09%) 6 (0.37%) 402 (24.71%)
Indonesia 1501 1223 250 (16.66%) 13 ( 0.87%) 3 (0.20%) 8 (0.53%) 4 (0.27%) 278 (18.52%)
Iran 292 216 75 (25.68%) 0 ( 0.00%) 0 (0.00%) 1 (0.34%) 0 (0.00%) 76 (26.03%)
Ireland 1661 1526 110 ( 6.62%) 10 ( 0.60%) 12 (0.72%) 3 (0.18%) 0 (0.00%) 135 ( 8.13%)
Israel 437 353 75 (17.16%) 8 ( 1.83%) 0 (0.00%) 1 (0.23%) 0 (0.00%) 84 (19.22%)
Italy 566 489 66 (11.66%) 7 ( 1.24%) 1 (0.18%) 3 (0.53%) 0 (0.00%) 77 (13.60%)
Japan 549 431 76 (13.84%) 36 ( 6.56%) 0 (0.00%) 5 (0.91%) 1 (0.18%) 118 (21.49%)
Kazakhstan 787 676 91 (11.56%) 11 ( 1.40%) 8 (1.02%) 1 (0.13%) 0 (0.00%) 111 (14.10%)
Kosovo 1373 994 359 (26.15%) 4 ( 0.29%) 12 (0.87%) 4 (0.29%) 0 (0.00%) 379 (27.60%)
Kuwait 315 241 69 (21.90%) 2 ( 0.63%) 2 (0.63%) 1 (0.32%) 0 (0.00%) 74 (23.49%)
Kyrgyzstan 375 274 74 (19.73%) 22 ( 5.87%) 1 (0.27%) 4 (1.07%) 0 (0.00%) 101 (26.93%)
Latvia 1023 806 206 (20.14%) 6 ( 0.59%) 2 (0.20%) 3 (0.29%) 0 (0.00%) 217 (21.21%)
Lebanon 416 322 84 (20.19%) 0 ( 0.00%) 3 (0.72%) 7 (1.68%) 0 (0.00%) 94 (22.60%)
Madagascar 169 145 22 (13.02%) 0 ( 0.00%) 0 (0.00%) 2 (1.18%) 0 (0.00%) 24 (14.20%)
Malaysia 816 706 99 (12.13%) 2 ( 0.25%) 1 (0.12%) 7 (0.86%) 1 (0.12%) 110 (13.48%)
Mexico 1164 1062 84 ( 7.22%) 10 ( 0.86%) 1 (0.09%) 6 (0.52%) 1 (0.09%) 102 ( 8.76%)
Moldova 511 398 100 (19.57%) 3 ( 0.59%) 4 (0.78%) 5 (0.98%) 1 (0.20%) 113 (22.11%)
Mongolia 367 261 100 (27.25%) 0 ( 0.00%) 6 (1.63%) 0 (0.00%) 0 (0.00%) 106 (28.88%)
Montenegro 358 301 45 (12.57%) 4 ( 1.12%) 7 (1.96%) 1 (0.28%) 0 (0.00%) 57 (15.92%)
Morocco 302 231 61 (20.20%) 3 ( 0.99%) 1 (0.33%) 6 (1.99%) 0 (0.00%) 71 (23.51%)
Mozambique 154 122 31 (20.13%) 0 ( 0.00%) 0 (0.00%) 1 (0.65%) 0 (0.00%) 32 (20.78%)
Netherlands 448 353 79 (17.63%) 14 ( 3.12%) 1 (0.22%) 1 (0.22%) 0 (0.00%) 95 (21.21%)
Nigeria 721 636 75 (10.40%) 0 ( 0.00%) 8 (1.11%) 2 (0.28%) 0 (0.00%) 85 (11.79%)
North Macedonia 268 230 37 (13.81%) 0 ( 0.00%) 1 (0.37%) 0 (0.00%) 0 (0.00%) 38 (14.18%)
Norway 509 408 90 (17.68%) 11 ( 2.16%) 0 (0.00%) 0 (0.00%) 0 (0.00%) 101 (19.84%)
Oman 520 413 100 (19.23%) 1 ( 0.19%) 3 (0.58%) 2 (0.38%) 1 (0.19%) 107 (20.58%)
Pakistan 507 401 94 (18.54%) 2 ( 0.39%) 1 (0.20%) 9 (1.78%) 0 (0.00%) 106 (20.91%)
Paraguay 205 162 42 (20.49%) 1 ( 0.49%) 0 (0.00%) 0 (0.00%) 0 (0.00%) 43 (20.98%)
Peru 1031 868 87 ( 8.44%) 48 ( 4.66%) 4 (0.39%) 23 (2.23%) 1 (0.10%) 163 (15.81%)
Philippines 3556 2636 769 (21.63%) 17 ( 0.48%) 120 (3.37%) 12 (0.34%) 2 (0.06%) 920 (25.87%)
Poland 1288 1024 250 (19.41%) 8 ( 0.62%) 3 (0.23%) 3 (0.23%) 0 (0.00%) 264 (20.50%)
Portugal 579 451 118 (20.38%) 7 ( 1.21%) 0 (0.00%) 3 (0.52%) 0 (0.00%) 128 (22.11%)
Qatar 526 397 113 (21.48%) 5 ( 0.95%) 11 (2.09%) 0 (0.00%) 0 (0.00%) 129 (24.52%)
Republic of Korea 492 425 27 ( 5.49%) 34 ( 6.91%) 1 (0.20%) 5 (1.02%) 0 (0.00%) 67 (13.62%)
Romania 861 676 174 (20.21%) 10 ( 1.16%) 0 (0.00%) 1 (0.12%) 0 (0.00%) 185 (21.49%)
Russia 1322 1168 73 ( 5.52%) 51 ( 3.86%) 17 (1.29%) 12 (0.91%) 1 (0.08%) 154 (11.65%)
Saudi Arabia 296 260 27 ( 9.12%) 6 ( 2.03%) 1 (0.34%) 2 (0.68%) 0 (0.00%) 36 (12.16%)
Senegal 211 142 66 (31.28%) 0 ( 0.00%) 1 (0.47%) 2 (0.95%) 0 (0.00%) 69 (32.70%)
Serbia 420 324 89 (21.19%) 4 ( 0.95%) 3 (0.71%) 0 (0.00%) 0 (0.00%) 96 (22.86%)
Singapore 298 239 19 ( 6.38%) 39 (13.09%) 0 (0.00%) 1 (0.34%) 0 (0.00%) 59 (19.80%)
Slovakia 724 517 196 (27.07%) 3 ( 0.41%) 5 (0.69%) 3 (0.41%) 0 (0.00%) 207 (28.59%)
Slovenia 746 584 154 (20.64%) 5 ( 0.67%) 1 (0.13%) 2 (0.27%) 0 (0.00%) 162 (21.72%)
South Africa 279 233 45 (16.13%) 0 ( 0.00%) 0 (0.00%) 1 (0.36%) 0 (0.00%) 46 (16.49%)
Spain 729 614 104 (14.27%) 6 ( 0.82%) 3 (0.41%) 2 (0.27%) 0 (0.00%) 115 (15.78%)
Sweden 1149 824 266 (23.15%) 53 ( 4.61%) 0 (0.00%) 6 (0.52%) 0 (0.00%) 325 (28.29%)
Switzerland 823 668 139 (16.89%) 14 ( 1.70%) 0 (0.00%) 2 (0.24%) 0 (0.00%) 155 (18.83%)
Taiwan 201 146 36 (17.91%) 17 ( 8.46%) 0 (0.00%) 2 (1.00%) 0 (0.00%) 55 (27.36%)
Thailand 440 375 59 (13.41%) 4 ( 0.91%) 0 (0.00%) 2 (0.45%) 0 (0.00%) 65 (14.77%)
Timor-Leste 277 144 129 (46.57%) 0 ( 0.00%) 1 (0.36%) 3 (1.08%) 0 (0.00%) 133 (48.01%)
Türkiye 682 487 171 (25.07%) 11 ( 1.61%) 12 (1.76%) 1 (0.15%) 0 (0.00%) 195 (28.59%)
UAE 336 228 86 (25.60%) 6 ( 1.79%) 13 (3.87%) 3 (0.89%) 0 (0.00%) 108 (32.14%)
UK 852 671 147 (17.25%) 26 ( 3.05%) 5 (0.59%) 3 (0.35%) 0 (0.00%) 181 (21.24%)
USA 5708 4242 1002 (17.55%) 170 ( 2.98%) 280 (4.91%) 11 (0.19%) 3 (0.05%) 1466 (25.68%)
Uganda 332 242 85 (25.60%) 0 ( 0.00%) 1 (0.30%) 4 (1.20%) 0 (0.00%) 90 (27.11%)
Ukraine 749 654 88 (11.75%) 3 ( 0.40%) 3 (0.40%) 1 (0.13%) 0 (0.00%) 95 (12.68%)
Uruguay 815 566 246 (30.18%) 1 ( 0.12%) 2 (0.25%) 0 (0.00%) 0 (0.00%) 249 (30.55%)
Uzbekistan 662 556 102 (15.41%) 4 ( 0.60%) 0 (0.00%) 0 (0.00%) 0 (0.00%) 106 (16.01%)
Yemen 580 397 170 (29.31%) 1 ( 0.17%) 5 (0.86%) 7 (1.21%) 0 (0.00%) 183 (31.55%)
Zimbabwe 275 210 59 (21.45%) 3 ( 1.09%) 1 (0.36%) 2 (0.73%) 0 (0.00%) 65 (23.64%)
Total 69408 53799 12181 (17.55%) 2332 ( 3.36%) 705 (1.02%) 365 (0.53%) 26 (0.04%) 15609 (22.49%)
gt::gtsave(gt_table, "222_exclusion_table.html")
gt::gtsave(gt_table, "222_exclusion_table.docx")
pagedown::chrome_print(
  "222_exclusion_table.html",
  output = "222_exclusion_table.pdf"
)

# Country-level inclusion and sample sizes
country_inclusion <- df_exclusion |>
  dplyr::group_by(country) |>
  dplyr::summarise(
    initial_number_of_participants = dplyr::n(),
    valid_participants = sum(exclusion_criteria == "valid", na.rm = TRUE),
    inclusion_rate = format(round(100 * valid_participants /
      initial_number_of_participants, 2), nsmall = 2)
  )

# Country with minimum and maximum inclusion rates
country_inclusion |>
  dplyr::slice_min(inclusion_rate, n = 1, with_ties = FALSE)
# A tibble: 1 × 4
  country initial_number_of_participants valid_participants inclusion_rate
  <chr>                            <int>              <int> <chr>         
1 China                             2523               1018 40.35         
country_inclusion |>
  dplyr::slice_max(inclusion_rate, n = 1, with_ties = FALSE)
# A tibble: 1 × 4
  country initial_number_of_participants valid_participants inclusion_rate
  <chr>                            <int>              <int> <chr>         
1 Ireland                           1661               1526 91.87         
# Countries with smallest and largest valid sample sizes
country_inclusion |>
  dplyr::slice_min(valid_participants, n = 1, with_ties = FALSE)
# A tibble: 1 × 4
  country initial_number_of_participants valid_participants inclusion_rate
  <chr>                            <int>              <int> <chr>         
1 Chad                               192                115 59.90         
country_inclusion |>
  dplyr::slice_max(valid_participants, n = 1, with_ties = FALSE)
# A tibble: 1 × 4
  country initial_number_of_participants valid_participants inclusion_rate
  <chr>                            <int>              <int> <chr>         
1 USA                               5708               4242 74.32         
# Cleanup
rm(df_exclusion, gt_table, summary_table, summary_table_pct, summary_table_pdf, total_row)

Missing Data

# Focus on the original variables
orig_cols <- base::intersect(names(df_final), names(df_pub))

df_final_orig <- df_final |>
  dplyr::select(all_of(orig_cols))

visdat::vis_miss(df_final_orig, cluster = TRUE, warn_large_data = FALSE)

df_final_orig |>
  dplyr::summarise(
    dplyr::across(
      dplyr::everything(),
      \(x) 100 * mean(is.na(x), na.rm = TRUE)
    ),
    .groups = "drop"
  ) |>
  tidyr::pivot_longer(
    cols = dplyr::everything(),
    names_to = "col",
    values_to = "pct_missing"
  ) |>
  print_reactable(sorted_col = "pct_missing", width = 600)
# Cleanup
rm(orig_cols, df_final_orig)

A0.3. Harmonize financial variables

Midpoint of the brackets

The midpoints of the brackets were computed as (low_bracket + high_bracket)/2 except the last bracket needs to be computed differently because it is open-ended. We computed the median ratio between the midpoints of the last bracket and the low point of the last bracket across countries where the low point of the last bracket was lower than the maximum open text answer provided by participants. This median ratio was then used to compute the midpoint of the last bracket in countries where the low point of the last bracket was higher than the maximum open text answer provided by participants.

# Calculate midpoints for all brackets except the last one
income_brackets <- income_recoded |>
  dplyr::mutate(
    income_midpoint = dplyr::case_when(
      income_orig == 9 ~ NA_real_,
      TRUE ~ (income_lowpoint + income_highpoint) / 2
    )
  ) |> 
  dplyr::select(UserLanguage, income_orig, income_midpoint) |>
  dplyr::glimpse(width = 100)
Rows: 1,125
Columns: 3
$ UserLanguage    <chr> "SQI-ALB", "SQI-ALB", "SQI-ALB", "SQI-ALB", "SQI-ALB", "SQI-ALB", "SQI-ALB…
$ income_orig     <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7,…
$ income_midpoint <dbl> 60000.0, 160000.0, 250000.0, 355000.0, 515000.0, 725000.0, 1040000.0, 1625…
nrow(df_final)
[1] 53799
df_final <- df_final |>
  dplyr::left_join(income_brackets, by = c("UserLanguage", "income_orig"))

nrow(df_final)
[1] 53799
# Calculate midpoints for the last bracket
midpoints_last <- df_final |>
  # There are no income values from the open text field in Taiwan
  dplyr::filter(country != "Taiwan") |>
  dplyr::group_by(country) |>
  dplyr::summarise(
    # Ireland have different lowpoints for the last bracket
    # in the main dataset and the sponsored dataset.
    max_income_lowpoint = base::max(income_lowpoint_9, na.rm = TRUE),
    max_income_text = base::max(income_text_reviewed, na.rm = TRUE),
    income_midpoint_last =
      base::mean(c(max_income_lowpoint, max_income_text), na.rm = TRUE),
    bracket_higher_than_text =
      !is.na(max_income_text) & max_income_lowpoint > max_income_text,
    ratio = income_midpoint_last/max_income_lowpoint
  ) |> 
  dplyr::glimpse(width = 100)
Rows: 91
Columns: 6
$ country                  <chr> "Albania", "Algeria", "Angola", "Argentina", "Armenia", "Australi…
$ max_income_lowpoint      <dbl> 2000000, 200000, 1450000001, 2400000, 1200001, 250000, 93163, 230…
$ max_income_text          <dbl> 2000000, 400000, 200000000, 40000000, 4000000, 290000, 200000, 55…
$ income_midpoint_last     <dbl> 2000000.0, 300000.0, 825000000.5, 21200000.0, 2600000.5, 270000.0…
$ bracket_higher_than_text <lgl> FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL…
$ ratio                    <dbl> 1.0000000, 1.5000000, 0.5689655, 8.8333333, 2.1666653, 1.0800000,…
ratio_last_bracket <- midpoints_last |>
  dplyr::filter(!bracket_higher_than_text) |>
  dplyr::summarise(median_ratio = stats::median(ratio, na.rm = TRUE)) |>
  dplyr::pull(median_ratio); ratio_last_bracket
[1] 2
nrow(df_final)
[1] 53799
df_final <- df_final |>
  dplyr::left_join(midpoints_last |> dplyr::select(country, bracket_higher_than_text, max_income_lowpoint, income_midpoint_last), by = "country")

nrow(df_final)
[1] 53799
# Update income_midpoint for the last bracket
df_final <-
  df_final |>
  dplyr::mutate(
    income_midpoint = dplyr::case_when(
      income_orig == 9 & !bracket_higher_than_text ~ income_midpoint_last,
      income_orig == 9 &  bracket_higher_than_text ~ max_income_lowpoint * ratio_last_bracket,
      income_orig == 9 & country == "Taiwan" 
       ~ income_lowpoint_9 * ratio_last_bracket,
      TRUE ~ income_midpoint
    )
  )

# Sanity check: View last midpoints
df_final |>
  dplyr::filter(income_orig == 9) |>
  dplyr::group_by(country, income_orig, income_lowpoint_9, income_midpoint) |>
  dplyr::summarise() |>
  dplyr::arrange(country) |>
  print_reactable(sorted_col = "income_midpoint", width = 800)
# Cleanup
rm(income_recoded, income_brackets, midpoints_last, ratio_last_bracket)

Convert monthly values to annual

We created a variable that combined midpoints from brackets with the open field answers and converted monthly values to annual values.

The survey versions for Bahrain and Pakistan requested for annual income in their native language, but monthly in the English version. If the distribution of responses is similar, we will retain the transformed monthly values.

table(df_final$income_period, useNA = "always")

 annual monthly    <NA> 
  28802   24997       0 
table(df_final$wages_per_year, df_final$income_period, useNA = "always")
      
       annual monthly  <NA>
  12        0   21935     0
  12.5      0     241     0
  13        0    2821     0
  <NA>  28802       0     0
df_final <- df_final |>
  dplyr::mutate(

    income_cont = dplyr::case_when(
      income_orig == 0 ~ 0,
      income_orig == 10 ~ income_text_reviewed,
      !is.na(income_midpoint) ~ income_midpoint,
      TRUE ~ NA_real_
    ),

    income_cont_nozero = dplyr::case_when(
      income_orig == 10 & income_text_reviewed > 0 ~ income_text_reviewed,
      !is.na(income_midpoint) ~ income_midpoint,
      TRUE ~ NA_real_
    ),

    income_annual = dplyr::case_when(
      income_period == "monthly" ~ income_cont_nozero * wages_per_year,
      TRUE ~ income_cont_nozero
    )
  )

df_final |> dplyr::group_by(country, income_orig, income_merg, income_text_reviewed,
                            income_merg_translated, income_cont, income_midpoint,
                            income_cont_nozero, income_annual, income_period) |>
  dplyr::summarise(n = dplyr::n()) |>
  dplyr::arrange(country, income_orig) |>
  print_reactable(sorted_col = "country", width = 900)

Convert 2025 income values to 2024

The inflation2024_factor was calculated with Consumer Price Index (CPI): CPI 2025 / CPI 2024

If national reports only provided the percentage of inflation change, then the factor was calculated as 1 + (percentage inflation change / 100). Then, to convert 2025 income to 2024 values we divided 2025 income by this factor.

table(df_final$income_year, useNA = "always")

 2024  2025  <NA> 
50097  3702     0 
table(df_final$inflation2024_factor, useNA = "always")

 0.966  1.001 1.0021 1.0182 1.0399 1.0415  1.045  1.046  1.073  1.087  1.118   <NA> 
   115    293    241    500    371    122    230    324    274    556    676  50097 
df_final <- df_final |>
  dplyr::mutate(
    income_annual_24 = dplyr::case_when(
      income_year == 2025 ~ income_annual / inflation2024_factor,
      TRUE ~ income_annual
    )
  )

Income net and gross

Collaborators from Kuwait, Oman, and Saudi Arabia confirmed that the income values do not require transformation because there is no income tax in these countries.

The survey version for Belgium requested net income in the Dutch version, and gross income in the French version.

Since the calculation of social contribution and tax deduction is not the same for all countries, we will remove the countries that asked for net income.

We will not apply tax brackets for Zimbabwe, since the system changes throughout the year due to inflation.

df_final <- df_final |>
  dplyr::mutate(
    income_annual_24_gross = dplyr::case_when(
      income_type == "net" ~ NA_real_,
      TRUE ~ income_annual_24
    )
  )

Converting all financial values to USD

df_final <- df_final |>
  dplyr::mutate(
    
    # We have already set values of 0 as NA earlier
    income_annual_24_gross_USD = income_annual_24_gross *
      one_local_unit_to_USD_conversion,

    # If assets or debts are 0, set to NA
    assets_USD = base::ifelse(
      is.na(assets_reviewed) | assets_reviewed == 0,
      NA_real_,
      assets_reviewed * one_local_unit_to_USD_conversion
    ),
    debts_USD = base::ifelse(
      is.na(debts_reviewed) | debts_reviewed == 0,
      NA_real_,
      debts_reviewed * one_local_unit_to_USD_conversion
    )
  )

Calculating z-scores and percentiles for assets and debts

# Sanity check: View means and sds
df_final |>
  dplyr::group_by(country, income_type) |>
  dplyr::summarise(
    n_income = base::round(base::sum(!is.na(income_annual_24_gross_USD)), 2),
    mean_income = base::round(base::mean(income_annual_24_gross_USD, na.rm = TRUE), 2),
    sd_income = base::round(stats::sd(income_annual_24_gross_USD, na.rm = TRUE), 2),

    n_assets = base::round(base::sum(!is.na(assets_USD)), 2),
    mean_assets = base::round(base::mean(assets_USD, na.rm = TRUE), 2),
    sd_assets = base::round(stats::sd(assets_USD, na.rm = TRUE), 2),

    n_debts = base::round(base::sum(!is.na(debts_USD)), 2),
    mean_debts = base::round(base::mean(debts_USD, na.rm = TRUE), 2),
    sd_debts = base::round(stats::sd(debts_USD, na.rm = TRUE), 2)
  )  |>
  print_reactable(sorted_col = "country", width = 900)
df_final <- df_final |>
  dplyr::group_by(country) |>
  dplyr::mutate(
    income_USD_z_local = base::ifelse(
      !is.na(income_annual_24_gross_USD),
      (income_annual_24_gross_USD - 
         base::mean(income_annual_24_gross_USD, na.rm = TRUE))
      / stats::sd(income_annual_24_gross_USD, na.rm = TRUE),
      NA_real_
    ),
    
    assets_USD_z_local = base::ifelse(
      !is.na(assets_USD),
      (assets_USD - base::mean(assets_USD, na.rm = TRUE))
      / stats::sd(assets_USD, na.rm = TRUE),
      NA_real_
    ),
    
    debts_USD_z_local = base::ifelse(
      !is.na(debts_USD),
      (debts_USD - base::mean(debts_USD, na.rm = TRUE))
      / stats::sd(debts_USD, na.rm = TRUE),
      NA_real_
    ),
    
    income_USD_percentile_local = base::ifelse(
      !is.na(income_annual_24_gross_USD),
      dplyr::percent_rank(income_annual_24_gross_USD),
      NA_real_
    ),
    
    assets_USD_percentile_local = base::ifelse(
      !is.na(assets_USD),
      dplyr::percent_rank(assets_USD),
      NA_real_
    ),
      
    debts_USD_percentile_local = base::ifelse(
      !is.na(debts_USD),
      dplyr::percent_rank(debts_USD),
      NA_real_
    )
  ) |>
  dplyr::ungroup() |>
  dplyr::mutate(
    income_USD_z_full = base::ifelse(
      !is.na(income_annual_24_gross_USD),
      (income_annual_24_gross_USD - 
         base::mean(income_annual_24_gross_USD, na.rm = TRUE))
      / stats::sd(income_annual_24_gross_USD, na.rm = TRUE),
      NA_real_
    ),
    
    assets_USD_z_full = base::ifelse(
    !is.na(assets_USD),
    (assets_USD - base::mean(assets_USD, na.rm = TRUE)) 
    / stats::sd(assets_USD, na.rm = TRUE),
      NA_real_
    ),
    
    debts_USD_z_full = base::ifelse(
    !is.na(debts_USD),
    (debts_USD  - base::mean(debts_USD, na.rm = TRUE)) 
    / stats::sd(debts_USD, na.rm = TRUE),
      NA_real_
    ),
    
    income_USD_percentile_full = base::ifelse(
      !is.na(income_annual_24_gross_USD),
      dplyr::percent_rank(income_annual_24_gross_USD),
      NA_real_
    ),
    
    assets_USD_percentile_full = base::ifelse(
    !is.na(assets_USD),
    dplyr::percent_rank(assets_USD),
      NA_real_
    ),
    
    debts_USD_percentile_full = base::ifelse(
    !is.na(debts_USD),
    dplyr::percent_rank(debts_USD),
      NA_real_
    )
  )


(df_gmh |> filter(is.na(income_cont)) |> nrow()) + 
  (df_gmh |> filter(income_cont == 0) |> nrow())
[1] 2664
df_gmh |> filter(is.na(income_annual)) |> nrow()
[1] 2664
df_gmh |> filter(is.na(income_annual_24)) |> nrow()
[1] 2664
(df_gmh |> filter(is.na(income_annual_24) & income_type != "net") |> nrow()) + 
  (df_gmh |> filter(income_type == "net") |> nrow())
[1] 13180
df_gmh |> filter(is.na(income_annual_24_gross)) |> nrow()
[1] 13180
df_gmh |> filter(is.na(income_annual_24_gross_USD)) |> nrow()
[1] 13180

A0.4. Weights data

For Moldova, Romania, Nigeria, Montenegro, Angola, Morocco, Uruguay, Paraguay, Greece, Iran, Hungary, Kosovo, Yemen, Chile, and Uganda, values of 1 were used instead of weighted scores.

# Load weights computed based on age, education, sex, and country
weights <- base::readRDS("444_weighted_data.RDS")

# Sanity check: View participants without weights due to missing sociodemographics
weights |>
  dplyr::filter(is.na(ps_weight), !is.na(education_recoded_cat)) |>
  dplyr::select(ResponseId, country, age, sex_binary_cat, education_recoded_cat) |>
  print_reactable(sorted_col = "country", width = 600)
# Merge weights into main data
nrow(df_final)
[1] 53799
df_gmh <- df_final |>
  dplyr::left_join( weights |> dplyr::select(ResponseId, ps_weight), by = "ResponseId")

nrow(df_gmh)
[1] 53799
# For a set of countries, recode the weight score to 1. Also recode NA to 1.
df_gmh <- df_gmh %>%
  mutate(ps_weight = base::ifelse(
    country %in% flagged_countries,
    1,
    ps_weight),
  ps_weight_flag = base::ifelse(
    country %in% flagged_countries,
    1, 0
  ),
  ps_weight_na = base::ifelse(
    country %in% flagged_countries,
    NA_real_, ps_weight
  ),
  ps_weight = base::ifelse(is.na(ps_weight), 1, ps_weight)
)

# Sanity check: How many missing values in weights after transforming those to 1?
df_gmh |>
  dplyr::summarise(
    n_missing_weights = base::sum(is.na(ps_weight_na)),
    perc_missing_weights = (n_missing_weights / dplyr::n()) * 100
  ) |> base::nrow()
[1] 1
# Cleanup
rm(weights)

A0.5. Calculate Factor Scores

Global Factor Scores

fit_mpwb <- lavaan::cfa(
  'mpwb =~ mpwb_competence + mpwb_emotional_stability + mpwb_engagement + mpwb_meaning + mpwb_optimism + mpwb_positive_emotion + mpwb_positive_relationships + mpwb_resilience + mpwb_self_esteem + mpwb_vitality',
  data = df_gmh,
  std.lv = TRUE,
  estimator = "MLR",
  sampling.weights = "ps_weight"
)

# We don't have any missing case on any of these variables
# and lavPredict keeps the same row order according to their manual
factor_scores <- lavaan::lavPredict(fit_mpwb, type = "lv")
df_gmh$mpwb_factor_global <- factor_scores[,1]

# View loadings
summary(fit_mpwb, fit.measures = TRUE, standardized = TRUE, rsquare = TRUE)
lavaan 0.6-19 ended normally after 15 iterations

  Estimator                                         ML
  Optimization method                           NLMINB
  Number of model parameters                        20

  Number of observations                          53799
  Sampling weights variable                   ps_weight

Model Test User Model:
                                               Standard      Scaled
  Test Statistic                              11658.139    4049.674
  Degrees of freedom                                 35          35
  P-value (Chi-square)                            0.000       0.000
  Scaling correction factor                                   2.879
    Yuan-Bentler correction (Mplus variant)                        

Model Test Baseline Model:

  Test statistic                            281635.351   94343.306
  Degrees of freedom                                45          45
  P-value                                        0.000       0.000
  Scaling correction factor                                  2.985

User Model versus Baseline Model:

  Comparative Fit Index (CFI)                    0.959       0.957
  Tucker-Lewis Index (TLI)                       0.947       0.945
                                                                  
  Robust Comparative Fit Index (CFI)                         0.959
  Robust Tucker-Lewis Index (TLI)                            0.947

Loglikelihood and Information Criteria:

  Loglikelihood user model (H0)            -840221.377 -840221.377
  Scaling correction factor                                  2.624
      for the MLR correction                                      
  Loglikelihood unrestricted model (H1)    -834392.308 -834392.308
  Scaling correction factor                                  2.786
      for the MLR correction                                      
                                                                  
  Akaike (AIC)                             1680482.755 1680482.755
  Bayesian (BIC)                           1680660.615 1680660.615
  Sample-size adjusted Bayesian (SABIC)    1680597.055 1680597.055

Root Mean Square Error of Approximation:

  RMSEA                                          0.079       0.046
  90 Percent confidence interval - lower         0.077       0.045
  90 Percent confidence interval - upper         0.080       0.047
  P-value H_0: RMSEA <= 0.050                    0.000       1.000
  P-value H_0: RMSEA >= 0.080                    0.025       0.000
                                                                  
  Robust RMSEA                                               0.078
  90 Percent confidence interval - lower                     0.076
  90 Percent confidence interval - upper                     0.080
  P-value H_0: Robust RMSEA <= 0.050                         0.000
  P-value H_0: Robust RMSEA >= 0.080                         0.092

Standardized Root Mean Square Residual:

  SRMR                                           0.031       0.031

Parameter Estimates:

  Standard errors                             Sandwich
  Information bread                           Observed
  Observed information based on                Hessian

Latent Variables:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
  mpwb =~                                                               
    mpwb_competenc    1.064    0.009  124.883    0.000    1.064    0.737
    mpwb_mtnl_stbl    1.123    0.008  139.341    0.000    1.123    0.743
    mpwb_engagemnt    0.773    0.009   85.772    0.000    0.773    0.577
    mpwb_meaning      1.127    0.009  130.748    0.000    1.127    0.757
    mpwb_optimism     1.213    0.008  145.537    0.000    1.213    0.763
    mpwb_postv_mtn    1.202    0.008  155.756    0.000    1.202    0.821
    mpwb_pstv_rltn    0.743    0.010   76.452    0.000    0.743    0.508
    mpwb_resilienc    0.970    0.008  114.688    0.000    0.970    0.654
    mpwb_self_estm    1.196    0.008  151.048    0.000    1.196    0.800
    mpwb_vitality     1.189    0.008  153.797    0.000    1.189    0.762

Variances:
                   Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
   .mpwb_competenc    0.951    0.012   80.689    0.000    0.951    0.457
   .mpwb_mtnl_stbl    1.023    0.012   84.746    0.000    1.023    0.448
   .mpwb_engagemnt    1.194    0.013   90.907    0.000    1.194    0.667
   .mpwb_meaning      0.949    0.012   76.064    0.000    0.949    0.428
   .mpwb_optimism     1.059    0.013   79.653    0.000    1.059    0.418
   .mpwb_postv_mtn    0.698    0.010   67.005    0.000    0.698    0.326
   .mpwb_pstv_rltn    1.583    0.017   95.070    0.000    1.583    0.742
   .mpwb_resilienc    1.262    0.013   99.437    0.000    1.262    0.573
   .mpwb_self_estm    0.804    0.011   71.741    0.000    0.804    0.360
   .mpwb_vitality     1.024    0.012   83.341    0.000    1.024    0.420
    mpwb              1.000                               1.000    1.000

R-Square:
                   Estimate
    mpwb_competenc    0.543
    mpwb_mtnl_stbl    0.552
    mpwb_engagemnt    0.333
    mpwb_meaning      0.572
    mpwb_optimism     0.582
    mpwb_postv_mtn    0.674
    mpwb_pstv_rltn    0.258
    mpwb_resilienc    0.427
    mpwb_self_estm    0.640
    mpwb_vitality     0.580
# Correlations (unweighted)
stats::cor(df_gmh$mpwb_factor_global, df_gmh$mpwb_sum) |> round(3)
[1] 0.995
stats::cor(df_gmh$mpwb_factor_global, df_gmh$mpwb_mean) |> round(3)
[1] 0.995
# Correlations (weighted)
weighted_corr(df_gmh, mpwb_factor_global, mpwb_sum)
      r        t     p
1 0.995 21956.04 <.001
weighted_corr(df_gmh, mpwb_factor_global, mpwb_mean)
      r        t     p
1 0.995 21874.82 <.001
# Sanity check: How many missing values in global factor scores?
df_gmh |>
  dplyr::filter(is.na(mpwb_factor_global)) |>
  base::nrow()
[1] 0
# Cleanup
rm(fit_mpwb, factor_scores)

Within Country Factor Scores

# Split data by country
country_list <- base::split(df_gmh, df_gmh$country)

# For each country we will fit CFA and extract scores
country_scores <- lapply(country_list, function(country_data) {

  fit <- lavaan::cfa(
    'mpwb =~ mpwb_competence + mpwb_emotional_stability + mpwb_engagement + mpwb_meaning + mpwb_optimism + mpwb_positive_emotion + mpwb_positive_relationships + mpwb_resilience + mpwb_self_esteem + mpwb_vitality',
    data = country_data,
    std.lv = TRUE,
    estimator = "MLR",
    sampling.weights = "ps_weight"
  )

  factor_scores <- lavaan::lavPredict(fit, type = "lv")[, 1]

  country_data$mpwb_factor_within <- factor_scores

  return(country_data)
})

# Recombine all countries
df_gmh <- dplyr::bind_rows(country_scores)

# Sanity check
df_gmh |>
  dplyr::group_by(country) |>
  dplyr::group_modify(~ {
    tibble::tibble(
      n = base::nrow(.x),
      mean_factor_within = base::round(base::mean(.x$mpwb_factor_within, na.rm = TRUE), 2),
      sd_factor_within = base::round(stats::sd(.x$mpwb_factor_within, na.rm = TRUE), 2),
      cor_mpwb_sum = base::round(
        stats::cor(.x$mpwb_factor_within, .x$mpwb_sum, use = "complete.obs"),
        3
      ),
      cor_mpwb_mean = base::round(
        stats::cor(.x$mpwb_factor_within, .x$mpwb_mean, use = "complete.obs"),
        3
      ),
      cor_mpwb_sum_wt = weighted_corr(.x, mpwb_factor_within, mpwb_sum)[[1]],
      cor_mpwb_mean_wt = weighted_corr(.x, mpwb_factor_within, mpwb_mean)[[1]]
    )
  }) |>
  dplyr::ungroup() |>
  print_reactable(sorted_col = "country", width = 500)
# Sanity check: How many missing values in factor scores?
df_gmh |>
  dplyr::filter(is.na(mpwb_factor_within)) |>
  base::nrow()
[1] 0
# Cleanup
rm(country_list, country_scores)

A0.6 Saving data

# Write labels from codebook to df_gmh
codebook_label <- codebook |>
  dplyr::select(variable, label) |>
  (\(x) { stats::setNames(x$label, x$variable) })()

for (v in names(codebook_label)) {
  labelled::var_label(df_gmh[[v]]) <- codebook_label[[v]]
}

# Save cleaned data
saveRDS(df_gmh, "999_cleaned_data.rds")
write.csv(df_gmh, "999_cleaned_data.csv", row.names = FALSE)

rm(v, codebook_label)

Information About the R Session

sessioninfo::session_info()
─ Session info ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
 setting  value
 version  R version 4.5.2 (2025-10-31)
 os       macOS Sequoia 15.6
 system   aarch64, darwin20
 ui       X11
 language (EN)
 collate  en_US.UTF-8
 ctype    en_US.UTF-8
 tz       Europe/Lisbon
 date     2025-12-26
 pandoc   3.6.3 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/aarch64/ (via rmarkdown)
 quarto   1.4.549 @ /usr/local/bin/quarto

─ Packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
 package           * version    date (UTC) lib source
 abind               1.4-8      2024-09-12 [1] CRAN (R 4.5.0)
 archive             1.1.12     2025-03-20 [1] CRAN (R 4.5.0)
 askpass             1.2.1      2024-10-04 [1] CRAN (R 4.5.0)
 backports           1.5.0      2024-05-23 [1] CRAN (R 4.5.0)
 base64enc           0.1-3      2015-07-28 [1] CRAN (R 4.5.0)
 binom             * 1.1-1.1    2022-05-02 [1] CRAN (R 4.5.0)
 bit                 4.6.0      2025-03-06 [1] CRAN (R 4.5.0)
 bit64               4.6.0-1    2025-01-16 [1] CRAN (R 4.5.0)
 boot                1.3-32     2025-08-29 [1] CRAN (R 4.5.2)
 broom               1.0.9      2025-07-28 [1] CRAN (R 4.5.0)
 broom.mixed       * 0.2.9.6    2024-10-15 [1] CRAN (R 4.5.0)
 car               * 3.1-3      2024-09-27 [1] CRAN (R 4.5.0)
 carData           * 3.0-5      2022-01-06 [1] CRAN (R 4.5.0)
 cellranger          1.1.0      2016-07-27 [1] CRAN (R 4.5.0)
 checkmate           2.3.2      2024-07-29 [1] CRAN (R 4.5.0)
 class               7.3-23     2025-01-01 [1] CRAN (R 4.5.2)
 classInt            0.4-11     2025-01-08 [1] CRAN (R 4.5.0)
 cli                 3.6.5      2025-04-23 [1] CRAN (R 4.5.0)
 cluster             2.1.8.1    2025-03-12 [1] CRAN (R 4.5.2)
 coda                0.19-4.1   2024-01-31 [1] CRAN (R 4.5.0)
 codetools           0.2-20     2024-03-31 [1] CRAN (R 4.5.2)
 colorspace          2.1-1      2024-07-26 [1] CRAN (R 4.5.0)
 corrplot          * 0.95       2024-10-14 [1] CRAN (R 4.5.0)
 countrycode       * 1.6.1      2025-03-31 [1] CRAN (R 4.5.0)
 cowplot           * 1.2.0      2025-07-07 [1] CRAN (R 4.5.0)
 crayon              1.5.3      2024-06-20 [1] CRAN (R 4.5.0)
 crosstalk           1.2.1      2023-11-23 [1] CRAN (R 4.5.0)
 curl                7.0.0      2025-08-19 [1] CRAN (R 4.5.0)
 data.table          1.17.8     2025-07-10 [1] CRAN (R 4.5.0)
 DBI                 1.2.3      2024-06-02 [1] CRAN (R 4.5.0)
 digest              0.6.37     2024-08-19 [1] CRAN (R 4.5.0)
 dplyr             * 1.1.4      2023-11-17 [1] CRAN (R 4.5.0)
 e1071               1.7-16     2024-09-16 [1] CRAN (R 4.5.0)
 emmeans           * 1.11.2     2025-07-11 [1] CRAN (R 4.5.0)
 estimability        1.5.1      2024-05-12 [1] CRAN (R 4.5.0)
 evaluate            1.0.4      2025-06-18 [1] CRAN (R 4.5.0)
 farver              2.1.2      2024-05-13 [1] CRAN (R 4.5.0)
 fastmap             1.2.0      2024-05-15 [1] CRAN (R 4.5.0)
 flextable         * 0.9.10     2025-08-24 [1] CRAN (R 4.5.0)
 fontBitstreamVera   0.1.1      2017-02-01 [1] CRAN (R 4.5.0)
 fontLiberation      0.1.0      2016-10-15 [1] CRAN (R 4.5.0)
 fontquiver          0.2.1      2017-02-01 [1] CRAN (R 4.5.0)
 forcats           * 1.0.0      2023-01-29 [1] CRAN (R 4.5.0)
 foreach             1.5.2      2022-02-02 [1] CRAN (R 4.5.0)
 foreign             0.8-90     2025-03-31 [1] CRAN (R 4.5.2)
 Formula             1.2-5      2023-02-24 [1] CRAN (R 4.5.0)
 fs                  1.6.6      2025-04-12 [1] CRAN (R 4.5.0)
 furrr               0.3.1      2022-08-15 [1] CRAN (R 4.5.0)
 future              1.67.0     2025-07-29 [1] CRAN (R 4.5.0)
 gdata               3.0.1      2024-10-22 [1] CRAN (R 4.5.0)
 gdtools             0.4.4      2025-10-06 [1] CRAN (R 4.5.0)
 generics            0.1.4      2025-05-09 [1] CRAN (R 4.5.0)
 ggeffects         * 2.3.0      2025-06-13 [1] CRAN (R 4.5.0)
 ggflags           * 0.0.4      2023-10-10 [1] https://jimjam-slam.r-universe.dev (R 4.5.1)
 ggfx              * 1.0.2      2025-07-24 [1] CRAN (R 4.5.0)
 ggh4x             * 0.3.1      2025-05-30 [1] CRAN (R 4.5.0)
 ggplot2           * 4.0.0      2025-09-11 [1] CRAN (R 4.5.0)
 ggplotify         * 0.1.2      2023-08-09 [1] CRAN (R 4.5.0)
 ggridges          * 0.5.7      2025-08-27 [1] CRAN (R 4.5.0)
 ggtext            * 0.1.2      2022-09-16 [1] CRAN (R 4.5.0)
 glmnet              4.1-10     2025-07-17 [1] CRAN (R 4.5.0)
 globals             0.18.0     2025-05-08 [1] CRAN (R 4.5.0)
 glue                1.8.0      2024-09-30 [1] CRAN (R 4.5.0)
 gridExtra         * 2.3        2017-09-09 [1] CRAN (R 4.5.0)
 gridGraphics        0.5-1      2020-12-13 [1] CRAN (R 4.5.0)
 gridtext            0.1.5      2022-09-16 [1] CRAN (R 4.5.0)
 gt                  1.0.0      2025-04-05 [1] CRAN (R 4.5.0)
 gtable            * 0.3.6      2024-10-25 [1] CRAN (R 4.5.0)
 gtools              3.9.5      2023-11-20 [1] CRAN (R 4.5.0)
 haven               2.5.5      2025-05-30 [1] CRAN (R 4.5.0)
 Hmisc             * 5.2-4      2025-10-05 [1] CRAN (R 4.5.0)
 hms                 1.1.3      2023-03-21 [1] CRAN (R 4.5.0)
 htmlTable           2.4.3      2024-07-21 [1] CRAN (R 4.5.0)
 htmltools         * 0.5.8.1    2024-04-04 [1] CRAN (R 4.5.0)
 htmlwidgets         1.6.4      2023-12-06 [1] CRAN (R 4.5.0)
 httpuv              1.6.16     2025-04-16 [1] CRAN (R 4.5.0)
 insight             1.4.2      2025-09-02 [1] CRAN (R 4.5.0)
 interactions      * 1.2.0      2024-07-29 [1] CRAN (R 4.5.0)
 iterators           1.0.14     2022-02-05 [1] CRAN (R 4.5.0)
 janitor           * 2.2.1      2024-12-22 [1] CRAN (R 4.5.0)
 jomo                2.7-6      2023-04-15 [1] CRAN (R 4.5.0)
 jsonlite            2.0.0      2025-03-27 [1] CRAN (R 4.5.0)
 jtools              2.3.0      2024-08-25 [1] CRAN (R 4.5.0)
 kableExtra        * 1.4.0      2024-01-24 [1] CRAN (R 4.5.0)
 KernSmooth          2.23-26    2025-01-01 [1] CRAN (R 4.5.2)
 knitr               1.50       2025-03-16 [1] CRAN (R 4.5.0)
 labeling            0.4.3      2023-08-29 [1] CRAN (R 4.5.0)
 labelled          * 2.16.0     2025-10-22 [1] CRAN (R 4.5.0)
 later               1.4.2      2025-04-08 [1] CRAN (R 4.5.0)
 lattice             0.22-7     2025-04-02 [1] CRAN (R 4.5.2)
 lavaan            * 0.6-19     2024-09-26 [1] CRAN (R 4.5.0)
 leaflet           * 2.2.2      2024-03-26 [1] CRAN (R 4.5.0)
 leaflet.extras    * 2.0.1      2024-08-19 [1] CRAN (R 4.5.0)
 leaflet.extras2   * 1.3.2      2025-08-27 [1] CRAN (R 4.5.0)
 lifecycle           1.0.4      2023-11-07 [1] CRAN (R 4.5.0)
 listenv             0.9.1      2024-01-29 [1] CRAN (R 4.5.0)
 lme4              * 1.1-37     2025-03-26 [1] CRAN (R 4.5.0)
 lsr               * 0.5.2      2021-12-01 [1] CRAN (R 4.5.0)
 lubridate         * 1.9.4      2024-12-08 [1] CRAN (R 4.5.0)
 magick              2.8.7      2025-06-06 [1] CRAN (R 4.5.0)
 magrittr            2.0.4      2025-09-12 [1] CRAN (R 4.5.0)
 MASS                7.3-65     2025-02-28 [1] CRAN (R 4.5.2)
 mathjaxr            2.0-0      2025-12-01 [1] CRAN (R 4.5.2)
 Matrix            * 1.7-4      2025-08-28 [1] CRAN (R 4.5.2)
 metadat           * 1.4-0      2025-02-04 [1] CRAN (R 4.5.0)
 metafor           * 4.8-0      2025-01-28 [1] CRAN (R 4.5.0)
 MetBrewer         * 0.2.0      2022-03-21 [1] CRAN (R 4.5.0)
 mgcv              * 1.9-3      2025-04-04 [1] CRAN (R 4.5.2)
 mice                3.18.0     2025-05-27 [1] CRAN (R 4.5.0)
 mime                0.13       2025-03-17 [1] CRAN (R 4.5.0)
 minqa               1.2.8      2024-08-17 [1] CRAN (R 4.5.0)
 mitml               0.4-5      2023-03-08 [1] CRAN (R 4.5.0)
 mitools             2.4        2019-04-26 [1] CRAN (R 4.5.0)
 mnormt              2.1.1      2022-09-26 [1] CRAN (R 4.5.0)
 multcomp            1.4-28     2025-01-29 [1] CRAN (R 4.5.0)
 mvtnorm             1.3-3      2025-01-10 [1] CRAN (R 4.5.0)
 nlme              * 3.1-168    2025-03-31 [1] CRAN (R 4.5.2)
 nloptr              2.2.1      2025-03-17 [1] CRAN (R 4.5.0)
 nnet                7.3-20     2025-01-01 [1] CRAN (R 4.5.2)
 numDeriv          * 2016.8-1.1 2019-06-06 [1] CRAN (R 4.5.0)
 officer           * 0.7.0      2025-09-03 [1] CRAN (R 4.5.0)
 openssl             2.3.3      2025-05-26 [1] CRAN (R 4.5.0)
 pacman            * 0.5.1      2019-03-11 [1] CRAN (R 4.5.0)
 pagedown            0.23       2025-08-20 [1] CRAN (R 4.5.0)
 pan                 1.9        2023-12-07 [1] CRAN (R 4.5.0)
 pander              0.6.6      2025-03-01 [1] CRAN (R 4.5.0)
 parallelly          1.45.1     2025-07-24 [1] CRAN (R 4.5.0)
 pbivnorm            0.6.0      2015-01-23 [1] CRAN (R 4.5.0)
 performance       * 0.15.2     2025-10-06 [1] CRAN (R 4.5.0)
 pillar              1.11.0     2025-07-04 [1] CRAN (R 4.5.0)
 pkgconfig           2.0.3      2019-09-22 [1] CRAN (R 4.5.0)
 processx            3.8.6      2025-02-21 [1] CRAN (R 4.5.0)
 promises            1.3.3      2025-05-29 [1] CRAN (R 4.5.0)
 proxy               0.4-27     2022-06-09 [1] CRAN (R 4.5.0)
 ps                  1.9.1      2025-04-12 [1] CRAN (R 4.5.0)
 psych             * 2.5.6      2025-06-23 [1] CRAN (R 4.5.0)
 purrr             * 1.1.0      2025-07-10 [1] CRAN (R 4.5.0)
 quadprog            1.5-8      2019-11-20 [1] CRAN (R 4.5.0)
 qualtRics         * 3.2.1      2024-08-16 [1] CRAN (R 4.5.0)
 R6                  2.6.1      2025-02-15 [1] CRAN (R 4.5.0)
 ragg                1.4.0      2025-04-10 [1] CRAN (R 4.5.0)
 rappdirs            0.3.3      2021-01-31 [1] CRAN (R 4.5.0)
 rbibutils           2.3        2024-10-04 [1] CRAN (R 4.5.0)
 RColorBrewer        1.1-3      2022-04-03 [1] CRAN (R 4.5.0)
 Rcpp                1.1.0      2025-07-02 [1] CRAN (R 4.5.0)
 Rdpack              2.6.4      2025-04-09 [1] CRAN (R 4.5.0)
 reactable         * 0.4.4      2023-03-12 [1] CRAN (R 4.5.0)
 reactR              0.6.1      2024-09-14 [1] CRAN (R 4.5.0)
 readr             * 2.1.5      2024-01-10 [1] CRAN (R 4.5.0)
 readxl            * 1.4.5      2025-03-07 [1] CRAN (R 4.5.0)
 reformulas          0.4.1      2025-04-30 [1] CRAN (R 4.5.0)
 report            * 0.6.1      2025-02-07 [1] CRAN (R 4.5.0)
 rlang             * 1.1.6      2025-04-11 [1] CRAN (R 4.5.0)
 rmarkdown           2.29       2024-11-04 [1] CRAN (R 4.5.0)
 rmcorr            * 0.7.0      2024-07-26 [1] CRAN (R 4.5.0)
 rnaturalearth     * 1.1.0      2025-07-28 [1] CRAN (R 4.5.0)
 rnaturalearthdata * 1.0.0      2024-02-09 [1] CRAN (R 4.5.0)
 rpart               4.1.24     2025-01-07 [1] CRAN (R 4.5.2)
 rstudioapi          0.17.1     2024-10-22 [1] CRAN (R 4.5.0)
 S7                  0.2.0      2024-11-07 [1] CRAN (R 4.5.0)
 sandwich            3.1-1      2024-09-15 [1] CRAN (R 4.5.0)
 sass                0.4.10     2025-04-11 [1] CRAN (R 4.5.0)
 scales            * 1.4.0      2025-04-24 [1] CRAN (R 4.5.0)
 see               * 0.11.0     2025-03-11 [1] CRAN (R 4.5.0)
 semTools          * 0.5-7      2025-03-13 [1] CRAN (R 4.5.0)
 servr               0.32       2024-10-04 [1] CRAN (R 4.5.0)
 sessioninfo       * 1.2.3      2025-02-05 [1] CRAN (R 4.5.0)
 sf                * 1.0-21     2025-05-15 [1] CRAN (R 4.5.0)
 shape               1.4.6.1    2024-02-23 [1] CRAN (R 4.5.0)
 showtext          * 0.9-7      2024-03-02 [1] CRAN (R 4.5.0)
 showtextdb        * 3.0        2020-06-04 [1] CRAN (R 4.5.0)
 sjlabelled          1.2.0      2022-04-10 [1] CRAN (R 4.5.0)
 sjPlot            * 2.9.0      2025-07-10 [1] CRAN (R 4.5.0)
 snakecase           0.11.1     2023-08-27 [1] CRAN (R 4.5.0)
 stringi             1.8.7      2025-03-27 [1] CRAN (R 4.5.0)
 stringr           * 1.5.1      2023-11-14 [1] CRAN (R 4.5.0)
 survey            * 4.4-8      2025-08-28 [1] CRAN (R 4.5.0)
 survival          * 3.8-3      2024-12-17 [1] CRAN (R 4.5.2)
 svglite             2.2.1      2025-05-12 [1] CRAN (R 4.5.0)
 sysfonts          * 0.8.9      2024-03-02 [1] CRAN (R 4.5.0)
 systemfonts         1.3.1      2025-10-01 [1] CRAN (R 4.5.0)
 textshaping         1.0.1      2025-05-01 [1] CRAN (R 4.5.0)
 TH.data             1.1-3      2025-01-17 [1] CRAN (R 4.5.0)
 tibble            * 3.3.0      2025-06-08 [1] CRAN (R 4.5.0)
 tidyr             * 1.3.1      2024-01-24 [1] CRAN (R 4.5.0)
 tidyselect          1.2.1      2024-03-11 [1] CRAN (R 4.5.0)
 timechange          0.3.0      2024-01-18 [1] CRAN (R 4.5.0)
 tzdb                0.5.0      2025-03-15 [1] CRAN (R 4.5.0)
 units               0.8-7      2025-03-11 [1] CRAN (R 4.5.0)
 utf8                1.2.6      2025-06-08 [1] CRAN (R 4.5.0)
 uuid                1.2-1      2024-07-29 [1] CRAN (R 4.5.0)
 vctrs               0.6.5      2023-12-01 [1] CRAN (R 4.5.0)
 viridisLite         0.4.2      2023-05-02 [1] CRAN (R 4.5.0)
 visdat            * 0.6.0      2023-02-02 [1] CRAN (R 4.5.0)
 vroom               1.6.5      2023-12-05 [1] CRAN (R 4.5.0)
 websocket           1.4.4      2025-04-10 [1] CRAN (R 4.5.0)
 weights           * 1.1.2      2025-06-18 [1] CRAN (R 4.5.0)
 withr               3.0.2      2024-10-28 [1] CRAN (R 4.5.0)
 xfun                0.52       2025-04-02 [1] CRAN (R 4.5.0)
 xml2                1.3.8      2025-03-14 [1] CRAN (R 4.5.0)
 xtable              1.8-4      2019-04-21 [1] CRAN (R 4.5.0)
 yaml                2.3.10     2024-07-26 [1] CRAN (R 4.5.0)
 yulab.utils         0.2.1      2025-08-19 [1] CRAN (R 4.5.0)
 zip                 2.3.3      2025-05-13 [1] CRAN (R 4.5.0)
 zoo                 1.8-14     2025-04-10 [1] CRAN (R 4.5.0)

 [1] /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library
 * ── Packages attached to the search path.

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────